Example #1
def main():
    print("Getting features for deleted papers from the database")
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1,
                                        max_features=None)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier, prefix="forest_")
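
The load-from-cache-or-query-the-database pattern above is repeated in almost every example below. As a reference, here is a minimal reusable sketch of that pattern, assuming only the standard library; the data_io.get_features_db call in the commented usage is taken from the examples and is not defined here. Note the binary file modes, which pickled data expects.

import os
import cPickle  # Python 2; on Python 3 use the pickle module instead

def load_or_build(cache_path, build_fn):
    """Return the cached object if it exists, otherwise build it and cache it."""
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:      # binary mode for pickled data
            return cPickle.load(loadfile)
    obj = build_fn()
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(obj, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return obj

# Hypothetical usage mirroring the examples:
# features_deleted = load_or_build("features_deleted.obj",
#                                  lambda: data_io.get_features_db("TrainDeleted"))
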
Example #2
def main():
    print("Getting features for deleted papers from the database")
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [[0] for x in range(len(features_deleted))] + [[1] for x in range(len(features_conf))]
    
    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))

   
    trainSet = zip(featuresInts, target)
    

   
    N = 5          #N : number of inputs/neurons for input layer
    H1 = 100       #H : number of neurons in hidden layer-1
    #H2 = 5
    M = 1           #number of outputs/neurons of the output layer
    
    learningRate = 0.1
    epochs =  1000
    
    #define layers of MLP keeping in mind that output of one layer is the number of inputs for the next layer
    layer0 = Layer(nNeurons=N, nInpsPerNeuron=-1, transferF='identity', ilayer=0, seed=13)           #input layer
    layer1 = Layer(nNeurons=H1, nInpsPerNeuron=N, transferF='tanh', ilayer=1, seed=13)                #hidden layer 1
    layer2 = Layer(nNeurons=M, nInpsPerNeuron=H1, transferF='tanh', ilayer=2, seed=13)                #output layer 
    #layer3 = Layer(nNeurons=M, nInpsPerNeuron=H2, transferF='logistic', ilayer=3)            #output layer
    
    layers = [layer0, layer1, layer2 ]
    
    mlp = Mlp(layers)
    mlp.showMlp()
    print "\n\nTraining Mlp for", epochs, "epochs.... please wait..."
    trainedMlp, iterations = mlp.trainMlp(trainSet, learningRate, epochs)
    print "\n\nFinished training of Mlp "
    trainedMlp.showMlp()
    
    print("Saving the classifier")
    data_io.save_model(mlp,prefix="mlp_")
Example #3
def train():
    ninp = 5
    nhidden = 10
    noutput = 1
    inpDim = 5
    targetDim = 1

    net = buildNetwork(ninp, nhidden, noutput, bias=True)
    ds = SupervisedDataSet(inpDim, targetDim)

    print("Getting features for deleted papers from the database")
    features_deleted = None
    features_conf = None
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [[0] for x in range(len(features_deleted))
              ] + [[1] for x in range(len(features_conf))]

    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))

    trainset = zip(featuresInts, target)

    for x, y in trainset:
        ds.addSample(x, y)

    print "training..."
    trainer = BackpropTrainer(net, ds)
    trainer.trainUntilConvergence()
    with open("net_pybrain.obj", 'w') as dumpfile:
        cPickle.dump(net, dumpfile, cPickle.HIGHEST_PROTOCOL)
def main():
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()
    print classifier.feature_importances_

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
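
The grouping and ranking step above, repeated verbatim in the later prediction examples, sorts each author's candidate papers by predicted probability, highest first. A small self-contained illustration with made-up ids and scores:

from collections import defaultdict

author_paper_ids = [(1, 'p1'), (1, 'p2'), (2, 'p3')]   # (author_id, paper_id) pairs
predictions = [0.2, 0.9, 0.6]                          # predicted probability per pair

author_predictions = defaultdict(list)
for (a_id, p_id), pred in zip(author_paper_ids, predictions):
    author_predictions[a_id].append((pred, p_id))

paper_predictions = {}
for author_id in sorted(author_predictions):
    ranked = sorted(author_predictions[author_id], reverse=True)   # highest score first
    paper_predictions[author_id] = [p_id for _, p_id in ranked]

print(paper_predictions)   # {1: ['p2', 'p1'], 2: ['p3']}
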
def main():
    print "Getting features for valid papers from the database"
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print "Loading the classifier"
    classifier = data_io.load_model()

    print "Making predictions"
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print "Writing predictions to file"
    data_io.write_submission(paper_predictions)
Example #6
def main():
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    featuresfloat = []
    for tup in features:
        a, b, c, d, e = tup
        featuresfloat.append(
            (float(a), float(b), float(c), float(d), float(e)))
    print("Total number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    predictions = logistic.predict_proba(featuresfloat)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Example #7
def main():
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    featuresfloat = []
    for tup in features:
       a, b, c, d, e = tup
       featuresfloat.append((float(a), float(b), float(c), float(d), float(e)))
    print("Total number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    predictions = logistic.predict_proba(featuresfloat)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Example #8
File: predict.py Project: pratapbhanu/misc
def main():
    print("Getting features for valid papers from the database")
    if(os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
Example #9
File: predict.py Project: pratapbhanu/misc
def main():
    print("Getting features for valid papers from the database")
    if (os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
Example #10
def train():
    ninp = 5
    nhidden = 10
    noutput = 1
    inpDim = 5
    targetDim = 1
    
    net = buildNetwork(ninp, nhidden, noutput, bias=True)
    ds = SupervisedDataSet(inpDim, targetDim)
    
    print("Getting features for deleted papers from the database")
    features_deleted = None; features_conf = None
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [[0] for x in range(len(features_deleted))] + [[1] for x in range(len(features_conf))]
    
    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))
    
    trainset = zip(featuresInts, target)
    
    for x, y in trainset:
        ds.addSample(x, y)
    
    print "training..."
    trainer = BackpropTrainer(net, ds)
    trainer.trainUntilConvergence()
    with open("net_pybrain.obj", 'w') as dumpfile:
        cPickle.dump(net, dumpfile, cPickle.HIGHEST_PROTOCOL)
def main():    
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier)
def main():
    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    features = [x[2:] for x in features_conf + features_deleted]
    target = [0 for x in range(len(features_conf))] + [1 for x in range(len(features_deleted))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier)
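
Note that this variant flips the label convention used in the other training examples (here confirmed = 0 and deleted = 1), so any downstream code must read the matching column of predict_proba. A small sketch, on made-up data, showing how to look that column up from classes_ instead of hard-coding [:, 1]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y = np.array([0, 1, 1, 0])                      # whichever label convention is in use

clf = RandomForestClassifier(n_estimators=10, random_state=1).fit(X, y)

# classes_ fixes the column order of predict_proba; look up the column for the
# label you want to rank by rather than assuming it is column 1.
col = list(clf.classes_).index(1)
scores = clf.predict_proba(X)[:, col]
print(scores)
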
Example #13
def main():
    print("Getting features for deleted papers from the database")
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1,
                                        max_features=None)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier, prefix="forest_")
Example #14
def main():
    
    print("Getting features for valid papers from the database")
    if(os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]
    
    predictInts = []
    for tup in features:
        a, b, c, d, e = tup
        predictInts.append((int(a), int(b), int(c), int(d), int(e)))

    print("Loading the classifier")
    mlp = data_io.load_model(prefix="mlp_")

    print("Making predictions")
    predictions = []
    for x in predictInts:
        # Propagate the inputs forward to compute the outputs
        outp = list(x)     # output of the input layer, i.e. the previous layer's output fed into the next layer
        for layer in mlp.layers[1:]:            # for all layers after the input layer
            for i in range(layer.nNeurons):
                layer.net[i] = weightedSum(outp, layer.W[1:, i]) + layer.W[0, i]
                layer.out[i] = g(layer.net[i], layer.transferF)   # pass the weighted sum through this layer's transfer function
            outp = layer.out   # this layer's output becomes the input to the next layer
        predictions.append(mlp.layers[-1].out[0])

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="mlp_")
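
The hand-rolled forward pass above depends on the project's Layer/Mlp classes and their weightedSum and g helpers, which are not shown here. For reference, the same propagation rule (bias plus weighted sum, then the transfer function, layer by layer) can be sketched with plain NumPy; the 5-100-1 shapes and tanh transfer mirror the network built in Example #2, and the weights below are random placeholders rather than trained parameters.

import numpy as np

def forward(x, layers):
    """Propagate an input through a list of (weights, bias, activation) layers."""
    out = np.asarray(x, dtype=float)
    for W, b, activation in layers:
        out = activation(W.dot(out) + b)        # weighted sum plus bias, then transfer function
    return out

rng = np.random.RandomState(13)
layers = [(0.1 * rng.randn(100, 5), np.zeros(100), np.tanh),   # hidden layer, 5 -> 100
          (0.1 * rng.randn(1, 100), np.zeros(1), np.tanh)]     # output layer, 100 -> 1
print(forward([1, 0, 2, 3, 1], layers))
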
Example #15
File: predict.py Project: pratapbhanu/misc
def main():
    print("Getting features for valid papers from the database")
    if(os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]
    
    #code for including keywords match feature
    print "adding additional features..."
    import additional_features as af
    all_features = af.get_additional_features()    
    _, _, kw_features = all_features    
    for i in range(len(features)):
        features[i]+= tuple(kw_features[i][2:])
    
    featuresnp = np.array(features, dtype='int32')
        
#    featuresnp -= np.mean(featuresnp, axis=0)
#    featuresnp /= np.std(featuresnp, axis=0)
    
    
    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(featuresnp)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
Example #16
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB
import os
import cPickle
import data_io
from sklearn import manifold

print("Getting features for deleted papers from the database")
if (os.path.exists("features_deleted.obj")):
    with open("features_deleted.obj", 'r') as loadfile:
        features_deleted = cPickle.load(loadfile)
else:
    features_deleted = data_io.get_features_db("TrainDeleted")
    with open("features_deleted.obj", 'w') as dumpfile:
        cPickle.dump(features_deleted,
                     dumpfile,
                     protocol=cPickle.HIGHEST_PROTOCOL)

print("Getting features for confirmed papers from the database")
if (os.path.exists("features_confirmed.obj")):
    with open("features_confirmed.obj", 'r') as loadfile:
        features_conf = cPickle.load(loadfile)
else:
    features_conf = data_io.get_features_db("TrainConfirmed")
    with open("features_confirmed.obj", 'w') as dumpfile:
        cPickle.dump(features_conf,
                     dumpfile,
                     protocol=cPickle.HIGHEST_PROTOCOL)
'''
Created on Jun 11, 2013

@author: navin.kolambkar
'''
from __future__ import division
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import data_io

if __name__ == '__main__':
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]
    
    X = np.array(features)
    y = np.array(target)
    
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    featuresTest = [x[2:] for x in data]
    
'''
Created on Jun 11, 2013

@author: navin.kolambkar
'''
from __future__ import division
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import data_io

if __name__ == '__main__':
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    X = np.array(features)
    y = np.array(target)

    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    featuresTest = [x[2:] for x in data]
Example #19
def test_mlp(learning_rate=0.013, L1_reg=0.0001, L2_reg=0.0003, n_epochs=10000, n_hidden=50, n_hidden2=10):
    """
    
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient)
    
        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)
    
        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)
    
        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer
    
    
       """
    np.random.seed(17)
    print ("Getting features for deleted papers from the database")
    features_deleted = None
    features_conf = None
    if os.path.exists("features_deleted.obj"):
        with open("features_deleted.obj", "r") as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", "w") as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print ("Getting features for confirmed papers from the database")
    if os.path.exists("features_confirmed.obj"):
        with open("features_confirmed.obj", "r") as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", "w") as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print ("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", "r") as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", "w") as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features_valid = [x[2:] for x in data]

    features_validnp = np.array(features_valid, dtype="float64")

    #        predictInts = []
    #        for tup in features_valid:
    #           a, b, c, d, e = tup
    #           predictInts.append((int(a), int(b), int(c), int(d), int(e)))
    #
    #        predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32')
    #        for i, tup in enumerate(predictInts):
    #            a, b, c, d, e = tup
    #            predictsMat[i, 0] = a;  predictsMat[i, 1] = b; predictsMat[i, 2] = c; predictsMat[i, 3] = d; predictsMat[i, 4] = e;
    predict_set_x = theano.shared(features_validnp, borrow=True)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    featuresnp = np.array(features, dtype="float64")
    targetnp = np.array(target, dtype="int32")

    featuresnp -= np.mean(featuresnp, axis=0)
    featuresnp /= np.std(featuresnp, axis=0)

    cv = cross_validation.ShuffleSplit(len(features), n_iter=1, test_size=0.25, random_state=0)
    for train, test in cv:
        train_set_x = theano.shared(featuresnp[train], borrow=True)
        test_set_x = theano.shared(featuresnp[test], borrow=True)
        train_set_y = theano.shared(targetnp[train], borrow=True)
        test_set_y = theano.shared(targetnp[test], borrow=True)

    batch_size = 20  # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print "... building the model"

    # allocate symbolic variables for the data

    #        size = T.lscalar()
    index = T.lscalar()
    x = T.matrix("x", dtype="float64")  # the data is presented as a dense matrix (a sparse csr_matrix could be used instead)
    y = T.ivector("y")  # the labels are presented as 1D vector of

    # [int] labels

    rng = np.random.RandomState(113)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=featuresnp.shape[1], n_hidden=n_hidden, n_out=2, n_hidden2=10)

    cost = classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    predict_model = theano.function(inputs=[], outputs=classifier.predictions(), givens={x: predict_set_x})

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a dictionary
    updates = OrderedDict()
    # given two lists of the same length, A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4],
    # zip generates a list C of the same size, where each element
    # is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    ###############
    # TRAIN MODEL #
    ###############
    print "... training"

    # early-stopping parameters
    patience = 1000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.0995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    best_params = None
    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        training_cost = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            training_cost.append(minibatch_avg_cost)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [test_model(i) for i in xrange(n_test_batches)]
                this_validation_loss = np.mean(validation_losses)

                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0)
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_params = classifier.params

            if best_validation_loss < 0.005:
                done_looping = True
                print "Best Validation cost: ", best_validation_loss
                break

        mean_cost = np.mean(training_cost)
        print "Epoch ", epoch, " training cost: ", mean_cost

    end_time = time.clock()
    print (
        (
            "Optimization complete. Best validation score of %f %% "
            "obtained at iteration %i, with test performance %f %%"
        )
        % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )

    print ("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")

    ############################
    # Making Predictions
    ############################

    print ("Making predictions")
    predictions = predict_model()  # classifier.predict_proba(features_valid)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print ("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="theano_")
Example #20
def main():
    print("Getting features for deleted papers from the database")
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    #code for including keywords match feature
    print "adding additional features..."
    import additional_features as af
    all_features = af.get_keywords_feature()
    kw_deleted, kw_confirmed, _ = all_features
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        _, _, ckw = kw_features[i]
        features[i] += (ckw, )

    featuresnp = np.array(features, dtype='float32')
    targetnp = np.array(target, dtype='int32')

    featuresnp -= np.mean(featuresnp, axis=0)
    featuresnp /= np.std(featuresnp, axis=0)

    # Set the parameters by cross-validation
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(featuresnp,
                                                        targetnp,
                                                        test_size=0.3,
                                                        random_state=0)

    tuned_parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }, {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }]

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(SVC(C=1),
                           tuned_parameters,
                           cv=4,
                           score_func=score,
                           n_jobs=4,
                           verbose=2)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_estimator_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.cv_scores_:
            print("%0.3f (+/-%0.03f) for %r" %
                  (mean_score, scores.std() / 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()

        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
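
The score_func argument and the cv_scores_ attribute used above appear to come from an older scikit-learn API. A rough equivalent with the current API (scoring= plus best_params_), shown on synthetic data since the feature arrays above are project-specific:

from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=6, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

for score in ('precision', 'recall'):
    clf = GridSearchCV(SVC(), tuned_parameters, cv=4, scoring=score, n_jobs=4, verbose=2)
    clf.fit(X_train, y_train)
    print("Best parameters for %s: %s" % (score, clf.best_params_))
    print(classification_report(y_test, clf.predict(X_test)))
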
Example #21
File: train.py Project: pratapbhanu/misc
def main():
    print("Getting features for deleted papers from the database")
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    #code for including keywords match feature
    print "adding additional features..."
    import additional_features as af
    all_features = af.get_additional_features()
    kw_deleted, kw_confirmed, _ = all_features
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    # Simple K-Fold cross-validation (5 folds) is left commented out; ShuffleSplit is used instead.
    #cv = cross_validation.KFold(len(features), n_folds=5)
    cv = cross_validation.ShuffleSplit(len(features),
                                       n_iter=4,
                                       test_size=0.4,
                                       random_state=0)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=1,
                                        random_state=0,
                                        compute_importances=True)

    featuresnp = np.array(features, dtype='int32')
    targetnp = np.array(target, dtype='int32')

    #    with open("wrong_predictions.txt", 'w' ) as wp:
    #        class1count = 0; class2count =0; rpredictions = 0
    #        for train, test in cv:
    #            x_train = featuresnp[train];        y_train = targetnp[train]
    #            x_test = featuresnp[test];         y_test = targetnp[test]
    #            classifier.fit(x_train, y_train)
    #            predictions = classifier.predict_proba(x_test)
    #            pred_classes = classifier.predict(x_test)
    #            for i in range(len(y_test)):
    #
    #                if y_test[i] != pred_classes[i] :
    #                    if(predictions[i,0] > 0.5 and predictions[i,0] < 0.6):
    #                        class1count+=1;
    #                    if(predictions[i,1] > 0.5 and predictions[i,1] < 0.6):
    #                        class2count+=1;
    #                    line = "feat: "+str(features[test[i]])+" ".join([ " a:",str(y_test[i])," p:", str(pred_classes[i])," proba:", str(predictions[i]), "\n"])
    #                    wp.write(line)
    #                else:
    #                    if(predictions[i,0] > 0.4 and predictions[i,0] < 0.6):
    #                        rpredictions+=1;
    #
    #        print "number of wrong predictions of deleted class: ", class1count
    #        print "number of wrong predictions of confirmed class: ", class2count
    #        print "number of right predictions with close probas", rpredictions
    #        for train, test in cv:
    #            print "total number of test examples: ", len(test)

    #    classifier.fit(featuresnp, targetnp)
    #    importances = classifier.feature_importances_
    ##    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
    ##                 axis=0)
    #    indices = np.argsort(importances)[::-1]
    #
    #    # Print the feature ranking
    #    print("Feature ranking:")
    #
    #    for f in range(len(indices)):
    #        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    #
    #    numFeatures = 15
    #    prunedFeatures = np.zeros(shape=(featuresnp.shape[0], numFeatures), dtype="int32")
    #    for i in range(prunedFeatures.shape[0]):
    #        for j, fi in enumerate(indices[0:numFeatures]):
    #            prunedFeatures[i,j] = featuresnp[i, fi]

    #    featuresnp -= np.mean(featuresnp, axis=0)
    #    featuresnp /= np.std(featuresnp, axis=0)

    #
    results = cross_validation.cross_val_score(classifier,
                                               X=featuresnp,
                                               y=targetnp,
                                               cv=cv,
                                               n_jobs=4,
                                               verbose=True)
    #print out the mean of the cross-validated results
    print "Results: ", results
    print "Results: " + str(np.array(results).mean())
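
The cross_validation module, the n_iter argument, compute_importances, and min_samples_split=1 above are all from an older scikit-learn release. A rough sketch of the same experiment with the current model_selection API (min_samples_split must be at least 2, and feature importances are always available after fitting), on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score

X, y = make_classification(n_samples=300, n_features=6, random_state=0)

cv = ShuffleSplit(n_splits=4, test_size=0.4, random_state=0)
classifier = RandomForestClassifier(n_estimators=100, min_samples_split=2, random_state=0)

results = cross_val_score(classifier, X=X, y=y, cv=cv, n_jobs=4)
print("Results: ", results)
print("Results: " + str(np.array(results).mean()))
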
Example #22
File: train.py Project: pratapbhanu/misc
def main():
    print("Getting features for deleted papers from the database")
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]
    
    
    #code for including keywords match feature
    print "adding additional features..."
    import additional_features as af
    all_features = af.get_additional_features()    
    kw_deleted, kw_confirmed, _ = all_features
    kw_features = kw_deleted+kw_confirmed
    for i in range(len(features)):
        features[i]+= tuple(kw_features[i][2:])
 
 
    # Simple K-Fold cross-validation (5 folds) is left commented out; ShuffleSplit is used instead.
    #cv = cross_validation.KFold(len(features), n_folds=5)
    cv = cross_validation.ShuffleSplit(len(features), n_iter=4, test_size=0.4, random_state=0)
    
    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=1,
                                        random_state=0, 
                                        compute_importances=True                                        
                                        )
 

    featuresnp = np.array(features, dtype='int32')
    targetnp = np.array(target, dtype='int32')
    
#    with open("wrong_predictions.txt", 'w' ) as wp:
#        class1count = 0; class2count =0; rpredictions = 0
#        for train, test in cv:
#            x_train = featuresnp[train];        y_train = targetnp[train]
#            x_test = featuresnp[test];         y_test = targetnp[test]
#            classifier.fit(x_train, y_train)
#            predictions = classifier.predict_proba(x_test)
#            pred_classes = classifier.predict(x_test)
#            for i in range(len(y_test)):
#            
#                if y_test[i] != pred_classes[i] :
#                    if(predictions[i,0] > 0.5 and predictions[i,0] < 0.6):
#                        class1count+=1;
#                    if(predictions[i,1] > 0.5 and predictions[i,1] < 0.6):
#                        class2count+=1;
#                    line = "feat: "+str(features[test[i]])+" ".join([ " a:",str(y_test[i])," p:", str(pred_classes[i])," proba:", str(predictions[i]), "\n"])
#                    wp.write(line)
#                else:
#                    if(predictions[i,0] > 0.4 and predictions[i,0] < 0.6):
#                        rpredictions+=1;
#                    
#        print "number of wrong predictions of deleted class: ", class1count
#        print "number of wrong predictions of confirmed class: ", class2count
#        print "number of right predictions with close probas", rpredictions
#        for train, test in cv:
#            print "total number of test examples: ", len(test)
        

#    classifier.fit(featuresnp, targetnp)
#    importances = classifier.feature_importances_
##    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
##                 axis=0)
#    indices = np.argsort(importances)[::-1]
#    
#    # Print the feature ranking
#    print("Feature ranking:")
#    
#    for f in range(len(indices)):
#        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#
#    numFeatures = 15   
#    prunedFeatures = np.zeros(shape=(featuresnp.shape[0], numFeatures), dtype="int32")
#    for i in range(prunedFeatures.shape[0]):
#        for j, fi in enumerate(indices[0:numFeatures]):
#            prunedFeatures[i,j] = featuresnp[i, fi]  
            

#    featuresnp -= np.mean(featuresnp, axis=0)
#    featuresnp /= np.std(featuresnp, axis=0)

#       
    results = cross_validation.cross_val_score(classifier, X=featuresnp, y=targetnp, cv=cv, n_jobs=4, verbose=True)
    #print out the mean of the cross-validated results
    print "Results: ", results
    print "Results: " + str( np.array(results).mean())
Example #23
def test_mlp(learning_rate=0.013, L1_reg=0.00, L2_reg=0.0003, n_epochs=300,
                          n_hidden=200):
        """
    
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient)
    
        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)
    
        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)
    
        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer
    
    
       """
        np.random.seed(0)  
        print("Getting features for deleted papers from the database")
        features_deleted = None; features_conf = None
        if(os.path.exists("features_deleted.obj")):
            with open("features_deleted.obj", 'r') as loadfile:
                features_deleted = cPickle.load(loadfile)
        else:
            features_deleted = data_io.get_features_db("TrainDeleted")
            with open("features_deleted.obj", 'w') as dumpfile:
                cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    
        print("Getting features for confirmed papers from the database")
        if(os.path.exists("features_confirmed.obj")):
            with open("features_confirmed.obj", 'r') as loadfile:
                features_conf = cPickle.load(loadfile)
        else:
            features_conf = data_io.get_features_db("TrainConfirmed")
            with open("features_confirmed.obj", 'w') as dumpfile:
                cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
                
                
        print("Getting features for valid papers from the database")
        if(os.path.exists("features_valid.obj")):
            with open("features_valid.obj", 'r') as loadfile:
                data = cPickle.load(loadfile)
        else:
            data = data_io.get_features_db("ValidPaper")
            with open("features_valid.obj", 'w') as dumpfile:
                cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
        
                 
        author_paper_ids = [x[:2] for x in data]
        features_valid = [x[2:] for x in data]    
        predictInts = []
        for tup in features_valid:
           a, b, c, d, e = tup
           predictInts.append((int(a), int(b), int(c), int(d), int(e)))
      
        predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32')
        for i, tup in enumerate(predictInts):
            a, b, c, d, e = tup
            predictsMat[i, 0] = a;  predictsMat[i, 1] = b; predictsMat[i, 2] = c; predictsMat[i, 3] = d; predictsMat[i, 4] = e; 
        predict_set_x = theano.shared(predictsMat, borrow=True)       
    
        features = [x[2:] for x in features_deleted + features_conf]
        target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]
    
        featuresInts = []
        for tup in features:
           a, b, c, d, e = tup
           featuresInts.append((int(a), int(b), int(c), int(d), int(e)))
        

        featuresMat = np.ndarray(shape=(len(featuresInts), 5), dtype='int32')
        for i, tup in enumerate(featuresInts):
            a, b, c, d, e = tup
            featuresMat[i, 0] = a;  featuresMat[i, 1] = b; featuresMat[i, 2] = c; featuresMat[i, 3] = d; featuresMat[i, 4] = e; 
        
        targetInts = np.ndarray(shape=len(target), dtype='int32')
        for i,e in enumerate(target):
            targetInts[i] =  int(e)
            
        datasets = load_data(featuresMat, targetInts)#gen_data2()
    
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        
    
        batch_size = 20    # size of the minibatch

        # compute number of minibatches for training, validation and testing
        n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
        n_test_batches  = test_set_x.get_value(borrow=True).shape[0]  / batch_size
    
        ######################
        # BUILD ACTUAL MODEL #
        ######################
        print '... building the model'
    
        # allocate symbolic variables for the data
        
        size = T.lscalar()  
        index = T.lscalar()
        x = T.matrix('x', dtype='int32')  # the data is presented as a dense matrix (a sparse csr_matrix could be used instead)
        y = T.ivector('y')  # the labels are presented as 1D vector of
    
                            # [int] labels
    
        rng = np.random.RandomState(113)
        
        # construct the MLP class
        classifier = MLP(rng=rng, input=x, n_in=5,
                         n_hidden=n_hidden, n_out=1)
    
        # the cost we minimize during training is the negative log likelihood of
        # the model plus the regularization terms (L1 and L2); cost is expressed
        # here symbolically
#        cost = classifier.negative_log_likelihood(y) \
#             + L1_reg * classifier.L1 \
#             + L2_reg * classifier.L2_sqr
        cost = classifier.cost(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr
    
        # compiling a Theano function that computes the mistakes that are made
        # by the model on a minibatch
#        test_model = theano.function(inputs=[size],
#                outputs=[classifier.errors(y),classifier.getPredictions()],
#                 givens={
#                    x: test_set_x[0:size],
#                    y: test_set_y[0:size]}
#                )
#    
#        validate_model = theano.function(inputs=[size],
#                outputs=[classifier.errors(y),classifier.getPredictions()],
#                 givens={
#                    x:valid_set_x[0:size],
#                    y:valid_set_y[0:size]}
#                )

        test_model = theano.function(inputs=[index],
                outputs=classifier.errors(y),
                givens={
                    x: test_set_x[index * batch_size: (index + 1) * batch_size],
                    y: test_set_y[index * batch_size: (index + 1) * batch_size]})
    
        validate_model = theano.function(inputs=[index],
                outputs=classifier.errors(y),
                givens={
                    x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                    y: valid_set_y[index * batch_size:(index + 1) * batch_size]})
        
        predict_model = theano.function(inputs=[],
                outputs=classifier.predictions(),
                givens={
                    x: predict_set_x})
    
        # compute the gradient of cost with respect to theta (stored in params)
        # the resulting gradients will be stored in a list gparams
        gparams = []
        for param in classifier.params:
            gparam = T.grad(cost, param)
            gparams.append(gparam)
    
        # specify how to update the parameters of the model as a dictionary
        updates = {}
        # given two lists of the same length, A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4],
        # zip generates a list C of the same size, where each element
        # is a pair formed from the two lists:
        #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
        for param, gparam in zip(classifier.params, gparams):
            updates[param] = param - learning_rate * gparam
    
        # compiling a Theano function `train_model` that returns the cost, but
        # in the same time updates the parameter of the model based on the rules
        # defined in `updates`
#        train_model = theano.function(inputs=[size],
#                                      outputs=cost,
#                updates=updates,
#                givens={
#                    x: train_set_x[0:size],
#                    y: train_set_y[0:size]}
#               )
        train_model = theano.function(inputs=[index],
            outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})
    
        ###############
        # TRAIN MODEL #
        ###############
        print '... training'
        

        # early-stopping parameters
        patience = 1000000  # look at this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is
                               # found
        improvement_threshold = 0.0995  # a relative improvement of this much is
                                        # considered significant
        validation_frequency = min(n_train_batches, patience / 2)
                                      # go through this many
                                      # minibatches before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch
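        # Illustrative sketch (standalone helper, not used below) of the patience
        # rule applied inline in the training loop: patience is extended only
        # when the new validation loss beats the previous best by the
        # improvement_threshold factor.
        def _patience_demo(patience, iter_no, new_loss, best_loss, threshold, increase):
            if new_loss < best_loss * threshold:
                return max(patience, iter_no * increase)
            return patience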
    
        best_params = None
        best_validation_loss = np.inf
        best_iter = 0
        test_score = 0.
        start_time = time.clock()
    
        epoch = 0
        done_looping = False
    
        
        
        
        while (epoch < n_epochs) and (not done_looping):
            #datasets = load_data(featuresMat, targetInts)#permute data()
#    
#            train_set_x, train_set_y = datasets[0]
#            valid_set_x, valid_set_y = datasets[1]
#            test_set_x, test_set_y = datasets[2]
            epoch = epoch + 1
            training_cost = []
            for minibatch_index in xrange(n_train_batches):
                minibatch_avg_cost = train_model(minibatch_index)
                training_cost.append(minibatch_avg_cost)
                # iteration number
                iter = (epoch - 1) * n_train_batches + minibatch_index
    
                if (iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = [validate_model(i) for i
                                         in xrange(n_valid_batches)]
                    this_validation_loss = np.mean(validation_losses)
    
#                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
#                         (epoch, minibatch_index + 1, n_train_batches,
#                          this_validation_loss * 100.))
    
                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                               improvement_threshold:
                            patience = max(patience, iter * patience_increase)
    
                        best_validation_loss = this_validation_loss
                        best_iter = iter
                        best_params = []
                        best_params.append(classifier.params)
    
                        # test it on the test set
                        test_losses = [test_model(i) for i
                                       in xrange(n_test_batches)]
                        test_score = np.mean(test_losses)
    
#                        print(('     epoch %i, minibatch %i/%i, test error of '
#                               'best model %f %%') %
#                              (epoch, minibatch_index + 1, n_train_batches,
#                               test_score * 100.))
#    
                mean_cost = np.mean(training_cost)
                if(mean_cost < 0.0005):
                    done_looping = True
                    print "training cost: ", mean_cost
                    break
            print "Epoch ", epoch," training cost: ", mean_cost
       
    
        end_time = time.clock()
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
        
        print("Saving the mlp best params")
        data_io.save_model(best_params, prefix="theano_")
        
        ############################
        #Making Predictions
        ############################
        
        print("Making predictions")
        predictions = predict_model()  # classifier.predict_proba(features_valid)[:,1]
        predictions = list(predictions)
    
        author_predictions = defaultdict(list)
        paper_predictions = {}
    
        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[a_id].append((pred, p_id))
    
        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
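        # Illustrative worked example (made-up ids and scores): sorting the
        # (pred, p_id) pairs in reverse order ranks each author's papers by
        # predicted confidence before writing the submission.
        _demo_pairs = [(0.9, 'p2'), (0.1, 'p7'), (0.4, 'p3')]
        _demo_rank = [p for (s, p) in sorted(_demo_pairs, reverse=True)]   # ['p2', 'p3', 'p7']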
    
        print("Writing predictions to file")
        data_io.write_submission(paper_predictions, prefix="theano_")
Example #24
0
def test_mlp(learning_rate=0.013,
             L1_reg=0.0001,
             L2_reg=0.0003,
             n_epochs=10000,
             n_hidden=50,
             n_hidden2=10):
    """
    
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient)
    
        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)
    
        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)
    
        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer
    
    
       """
    np.random.seed(17)
    print("Getting features for deleted papers from the database")
    features_deleted = None
    features_conf = None
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for valid papers from the database")
    if (os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features_valid = [x[2:] for x in data]

    features_validnp = np.array(features_valid, dtype='float64')

    #        predictInts = []
    #        for tup in features_valid:
    #           a, b, c, d, e = tup
    #           predictInts.append((int(a), int(b), int(c), int(d), int(e)))
    #
    #        predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32')
    #        for i, tup in enumerate(predictInts):
    #            a, b, c, d, e = tup
    #            predictsMat[i, 0] = a;  predictsMat[i, 1] = b; predictsMat[i, 2] = c; predictsMat[i, 3] = d; predictsMat[i, 4] = e;
    predict_set_x = theano.shared(features_validnp, borrow=True)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + \
             [1 for x in range(len(features_conf))]

    featuresnp = np.array(features, dtype='float64')
    targetnp = np.array(target, dtype='int32')

    featuresnp -= np.mean(featuresnp, axis=0)
    featuresnp /= np.std(featuresnp, axis=0)
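    # Illustrative toy example (assumed 2x2 matrix, not the real feature
    # matrix) of the standardization done just above: each feature column is
    # shifted to zero mean and scaled to unit variance.
    _m_demo = np.array([[1.0, 10.0], [3.0, 30.0]])
    _m_std_demo = (_m_demo - _m_demo.mean(axis=0)) / _m_demo.std(axis=0)   # every column becomes [-1., 1.]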

    cv = cross_validation.ShuffleSplit(len(features),
                                       n_iter=1,
                                       test_size=0.25,
                                       random_state=0)
    for train, test in cv:
        train_set_x = theano.shared(featuresnp[train], borrow=True)
        test_set_x = theano.shared(featuresnp[test], borrow=True)
        train_set_y = theano.shared(targetnp[train], borrow=True)
        test_set_y = theano.shared(targetnp[test], borrow=True)
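    # Illustrative sketch (assumed 8 samples, not the real data): with n_iter=1
    # and test_size=0.25, ShuffleSplit yields a single (train, test) pair of
    # index arrays, here 6 training rows and 2 test rows.
    _cv_demo = cross_validation.ShuffleSplit(8, n_iter=1, test_size=0.25, random_state=0)
    _demo_split_sizes = [(len(tr), len(te)) for tr, te in _cv_demo]   # [(6, 2)]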

    batch_size = 20  # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
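    # Illustrative note (assumed numbers): the integer division above drops any
    # incomplete final minibatch, e.g. 105 rows with batch_size=20 give 5
    # usable batches under Python 2 integer division.
    _n_batches_demo = 105 / 20   # == 5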

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data

    #        size = T.lscalar()
    index = T.lscalar()
    x = T.matrix('x', dtype='float64')  # the data is presented as a dense float64
                                        # matrix (sparse.csr_matrix('x', dtype='int32')
                                        # would be the sparse alternative)
    y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

    rng = np.random.RandomState(113)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=featuresnp.shape[1],
                     n_hidden=n_hidden,
                     n_out=2,
                     n_hidden2=10)

    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    predict_model = theano.function(inputs=[],
                                    outputs=classifier.predictions(),
                                    givens={x: predict_set_x})

    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a dictionary
    updates = OrderedDict()
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam
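    # Illustrative note (the list below is not used further): theano.function
    # also accepts the same updates as a list of (shared_variable, expression)
    # pairs, interchangeable with the OrderedDict built above.
    _updates_as_pairs = [(p, p - learning_rate * g)
                         for p, g in zip(classifier.params, gparams)]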

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 1000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.0995  # a relative improvement of this much is
                                    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        training_cost = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            training_cost.append(minibatch_avg_cost)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    test_model(i) for i in xrange(n_test_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_params = classifier.params

            if (best_validation_loss < 0.005):
                done_looping = True
                print "Best Validation cost: ", best_validation_loss
                break

        mean_cost = np.mean(training_cost)
        print "Epoch ", epoch, " training cost: ", mean_cost

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")

    ############################
    #Making Predictions
    ############################

    print("Making predictions")
    predictions = predict_model()  # classifier.predict_proba(features_valid)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="theano_")
Example #25
0
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB
import os
import cPickle
import data_io
from sklearn import manifold



print("Getting features for deleted papers from the database")
if(os.path.exists("features_deleted.obj")):
    with open("features_deleted.obj", 'r') as loadfile:
        features_deleted = cPickle.load(loadfile)
else:
    features_deleted = data_io.get_features_db("TrainDeleted")
    with open("features_deleted.obj", 'w') as dumpfile:
        cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

print("Getting features for confirmed papers from the database")
if(os.path.exists("features_confirmed.obj")):
    with open("features_confirmed.obj", 'r') as loadfile:
        features_conf = cPickle.load(loadfile)
else:
    features_conf = data_io.get_features_db("TrainConfirmed")
    with open("features_confirmed.obj", 'w') as dumpfile:
        cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

features = [x[2:] for x in features_deleted + features_conf]
target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]