def main(): print("Getting features for deleted papers from the database") if(os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if(os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, max_features=None) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, prefix="forest_")
def main(): print("Getting features for deleted papers from the database") if(os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if(os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [[0] for x in range(len(features_deleted))] + [[1] for x in range(len(features_conf))] featuresInts = [] for tup in features: a, b, c, d, e = tup featuresInts.append((int(a), int(b), int(c), int(d), int(e))) trainSet = zip(featuresInts, target) N = 5 #N : number of inputs/neurons for input layer H1 = 100 #H : number of neurons in hidden layer-1 #H2 = 5 M = 1 #number of outputs/neurons of the output layer learningRate = 0.1 epochs = 1000 #define layers of MLP keeping in mind that output of one layer is the number of inputs for the next layer layer0 = Layer(nNeurons=N, nInpsPerNeuron=-1, transferF='identity', ilayer=0, seed=13) #input layer layer1 = Layer(nNeurons=H1, nInpsPerNeuron=N, transferF='tanh', ilayer=1, seed=13) #hidden layer 1 layer2 = Layer(nNeurons=M, nInpsPerNeuron=H1, transferF='tanh', ilayer=2, seed=13) #output layer #layer3 = Layer(nNeurons=M, nInpsPerNeuron=H2, transferF='logistic', ilayer=3) #output layer layers = [layer0, layer1, layer2 ] mlp = Mlp(layers) mlp.showMlp() print "\n\nTraining Mlp for", epochs," Epochs.... please wait... " trainedMlp, iterations = mlp.trainMlp(trainSet, learningRate, epochs) print "\n\nFinished training of Mlp " trainedMlp.showMlp() print("Saving the classifier") data_io.save_model(mlp,prefix="mlp_")
def train():
    ninp = 5      # input neurons
    nhidden = 10  # hidden neurons
    noutput = 1   # output neurons
    inpDim = 5
    targetDim = 1
    net = buildNetwork(ninp, nhidden, noutput, bias=True)
    ds = SupervisedDataSet(inpDim, targetDim)

    print("Getting features for deleted papers from the database")
    if os.path.exists("features_deleted.obj"):
        with open("features_deleted.obj", 'rb') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'wb') as dumpfile:
            cPickle.dump(features_deleted, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if os.path.exists("features_confirmed.obj"):
        with open("features_confirmed.obj", 'rb') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'wb') as dumpfile:
            cPickle.dump(features_conf, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [[0] for _ in range(len(features_deleted))] + \
             [[1] for _ in range(len(features_conf))]

    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))

    trainset = zip(featuresInts, target)
    for x, y in trainset:
        ds.addSample(x, y)

    print "training..."
    trainer = BackpropTrainer(net, ds)
    trainer.trainUntilConvergence()

    with open("net_pybrain.obj", 'wb') as dumpfile:
        cPickle.dump(net, dumpfile, cPickle.HIGHEST_PROTOCOL)
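# Usage sketch for the net pickled above: pybrain networks score a single
# sample with net.activate (the input values here are made up).
import cPickle

with open("net_pybrain.obj", 'rb') as f:
    net = cPickle.load(f)
print net.activate((1, 2, 3, 4, 5))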
def main(): print("Getting features for valid papers from the database") data = data_io.get_features_db("ValidPaper") author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] print("Loading the classifier") classifier = data_io.load_model() print classifier.feature_importances_ print("Making predictions") predictions = classifier.predict_proba(features)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions)
def main(): print "Getting features for valid papers from the database" data = data_io.get_features_db("ValidPaper") author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] print "Loading the classifier" classifier = data_io.load_model() print "Making predictions" predictions = classifier.predict_proba(features)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print "Writing predictions to file" data_io.write_submission(paper_predictions)
def main(): print("Getting features for valid papers from the database") data = data_io.get_features_db("ValidPaper") author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] featuresfloat = [] for tup in features: a, b, c, d, e = tup featuresfloat.append( (float(a), float(b), float(c), float(d), float(e))) print("Totoal number of samples: ", len(featuresfloat)) print("Loading the logistic regression model") logistic = data_io.load_model() print("Making predictions") predictions = logistic.predict_proba(featuresfloat)[:, 1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions)
def main(): print("Getting features for valid papers from the database") data = data_io.get_features_db("ValidPaper") author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] featuresfloat = [] for tup in features: a, b, c, d, e = tup featuresfloat.append((float(a), float(b), float(c), float(d), float(e))) print("Totoal number of samples: ", len(featuresfloat)) print("Loading the logistic regression model") logistic = data_io.load_model() print("Making predictions") predictions = logistic.predict_proba(featuresfloat)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions)
def main(): print("Getting features for valid papers from the database") if(os.path.exists("features_valid.obj")): with open("features_valid.obj", 'r') as loadfile: data = cPickle.load(loadfile) else: data = data_io.get_features_db("ValidPaper") with open("features_valid.obj", 'w') as dumpfile: cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] print("Loading the classifier") classifier = data_io.load_model(prefix="forest_") print("Making predictions") predictions = classifier.predict_proba(features)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions, prefix="forest_")
def main(): print("Getting features for valid papers from the database") if (os.path.exists("features_valid.obj")): with open("features_valid.obj", 'r') as loadfile: data = cPickle.load(loadfile) else: data = data_io.get_features_db("ValidPaper") with open("features_valid.obj", 'w') as dumpfile: cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] print("Loading the classifier") classifier = data_io.load_model(prefix="forest_") print("Making predictions") predictions = classifier.predict_proba(features)[:, 1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions, prefix="forest_")
def main(): print("Getting features for deleted papers from the database") features_deleted = data_io.get_features_db("TrainDeleted") print("Getting features for confirmed papers from the database") features_conf = data_io.get_features_db("TrainConfirmed") features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Getting features for confirmed papers from the database") features_conf = data_io.get_features_db("TrainConfirmed") print("Getting features for deleted papers from the database") features_deleted = data_io.get_features_db("TrainDeleted") features = [x[2:] for x in features_conf + features_deleted] target = [0 for x in range(len(features_conf))] + [1 for x in range(len(features_deleted))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Getting features for deleted papers from the database") if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, max_features=None) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, prefix="forest_")
def main(): print("Getting features for valid papers from the database") if(os.path.exists("features_valid.obj")): with open("features_valid.obj", 'r') as loadfile: data = cPickle.load(loadfile) else: data = data_io.get_features_db("ValidPaper") with open("features_valid.obj", 'w') as dumpfile: cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] predictInts = [] for tup in features: a, b, c, d, e = tup predictInts.append((int(a), int(b), int(c), int(d), int(e))) print("Loading the classifier") mlp = data_io.load_model(prefix="mlp_") print("Making predictions") predictions = [] for x in predictInts : #Propagate the inputs forward to compute the outputs outp = list(x) #output of input layer i.e. output of previous layer to be used as input for next layer for layer in mlp.layers[1:] : #for all layers starting from the second layer for i in range(layer.nNeurons): layer.net[i] = weightedSum(outp, layer.W[1:,i]) + layer.W[0,i] layer.out[i] = g(layer.net[i], layer.transferF) #pass this weighted sum through the transfer function of this layer outp = layer.out predictions.append(mlp.layers[-1].out[0]) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions, prefix="mlp_")
def main(): print("Getting features for valid papers from the database") if(os.path.exists("features_valid.obj")): with open("features_valid.obj", 'r') as loadfile: data = cPickle.load(loadfile) else: data = data_io.get_features_db("ValidPaper") with open("features_valid.obj", 'w') as dumpfile: cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) author_paper_ids = [x[:2] for x in data] features = [x[2:] for x in data] #code for including keywords match feature print "adding addtional features..." import additional_features as af all_features = af.get_additional_features() _, _, kw_features = all_features for i in range(len(features)): features[i]+= tuple(kw_features[i][2:]) featuresnp = np.array(features, dtype='int32') # featuresnp -= np.mean(featuresnp, axis=0) # featuresnp /= np.std(featuresnp, axis=0) print("Loading the classifier") classifier = data_io.load_model(prefix="forest_") print("Making predictions") predictions = classifier.predict_proba(featuresnp)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions, prefix="forest_")
'''
Created on Jun 11, 2013

@author: navin.kolambkar
'''
from __future__ import division

from collections import defaultdict

from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
    GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

import data_io

if __name__ == '__main__':
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")
    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for _ in range(len(features_deleted))] + \
             [1 for _ in range(len(features_conf))]

    X = np.array(features)
    y = np.array(target)

    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    featuresTest = [x[2:] for x in data]
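# The script stops after assembling X, y, and featuresTest. Given the
# imports, a plausible continuation (a sketch, not the author's code) is
# a stratified K-fold comparison of the imported classifiers:
for clf in [RandomForestClassifier(n_estimators=50, random_state=1),
            GradientBoostingClassifier(random_state=1),
            LogisticRegression()]:
    fold_scores = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=5):
        clf.fit(X[train_idx], y[train_idx])
        fold_scores.append(clf.score(X[test_idx], y[test_idx]))
    print clf.__class__.__name__, np.mean(fold_scores)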
def test_mlp(learning_rate=0.013, L1_reg=0.0001, L2_reg=0.0003,
             n_epochs=10000, n_hidden=50, n_hidden2=10):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    """
    np.random.seed(17)

    print("Getting features for deleted papers from the database")
    if os.path.exists("features_deleted.obj"):
        with open("features_deleted.obj", 'rb') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'wb') as dumpfile:
            cPickle.dump(features_deleted, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if os.path.exists("features_confirmed.obj"):
        with open("features_confirmed.obj", 'rb') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'wb') as dumpfile:
            cPickle.dump(features_conf, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features_valid = [x[2:] for x in data]
    features_validnp = np.array(features_valid, dtype='float64')

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for _ in range(len(features_deleted))] + \
             [1 for _ in range(len(features_conf))]
    featuresnp = np.array(features, dtype='float64')
    targetnp = np.array(target, dtype='int32')

    # Standardize to zero mean and unit variance, and apply the same
    # training-set statistics to the prediction features so the model
    # sees inputs on the scale it was trained on.
    feat_mean = np.mean(featuresnp, axis=0)
    feat_std = np.std(featuresnp, axis=0)
    featuresnp = (featuresnp - feat_mean) / feat_std
    features_validnp = (features_validnp - feat_mean) / feat_std

    predict_set_x = theano.shared(features_validnp, borrow=True)

    cv = cross_validation.ShuffleSplit(len(features), n_iter=1,
                                       test_size=0.25, random_state=0)
    for train, test in cv:
        train_set_x = theano.shared(featuresnp[train], borrow=True)
        test_set_x = theano.shared(featuresnp[test], borrow=True)
        train_set_y = theano.shared(targetnp[train], borrow=True)
        test_set_y = theano.shared(targetnp[test], borrow=True)

    batch_size = 20  # size of the minibatch
    # compute the number of minibatches for training and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    # n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()
    x = T.matrix('x', dtype='float64')  # the data is presented as a dense matrix
    y = T.ivector('y')                  # the labels are a 1D vector of [int] labels

    rng = np.random.RandomState(113)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=featuresnp.shape[1],
                     n_hidden=n_hidden, n_out=2, n_hidden2=n_hidden2)

    # the cost we minimize during training is the negative log likelihood
    # plus the L1 and L2 regularization terms
    cost = classifier.negative_log_likelihood(y) \
        + L1_reg * classifier.L1 \
        + L2_reg * classifier.L2_sqr

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    predict_model = theano.function(
        inputs=[],
        outputs=classifier.predictions(),
        givens={x: predict_set_x})

    # compute the gradient of the cost with respect to theta (stored in
    # params); the resulting gradients are collected in gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # pair each parameter with its SGD update rule (an ordered mapping,
    # as theano expects)
    updates = OrderedDict()
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 1000000     # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.0995  # a relative improvement of this much is
                                    # considered significant
    # go through this many minibatches before checking the network on the
    # held-out set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        training_cost = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            training_cost.append(minibatch_avg_cost)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the held-out set
                validation_losses = [test_model(i)
                                     for i in xrange(n_test_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_params = classifier.params

                if best_validation_loss < 0.005:
                    done_looping = True
                    print "Best Validation cost: ", best_validation_loss
                    break

        mean_cost = np.mean(training_cost)
        print "Epoch ", epoch, " training cost: ", mean_cost

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")

    ############################
    # Making Predictions
    ############################
    print("Making predictions")
    predictions = list(predict_model())

    author_predictions = defaultdict(list)
    paper_predictions = {}
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="theano_")
def main(): print("Getting features for deleted papers from the database") if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] #code for including keywords match feature print "adding addtional features..." import additional_features as af all_features = af.get_keywords_feature() kw_deleted, kw_confirmed, _ = all_features kw_features = kw_deleted + kw_confirmed for i in range(len(features)): _, _, ckw = kw_features[i] features[i] += (ckw, ) featuresnp = np.array(features, dtype='float32') targetnp = np.array(target, dtype='int32') featuresnp -= np.mean(featuresnp, axis=0) featuresnp /= np.std(featuresnp, axis=0) # Set the parameters by cross-validation # Split the dataset in two equal parts X_train, X_test, y_train, y_test = train_test_split(featuresnp, targetnp, test_size=0.3, random_state=0) tuned_parameters = [{ 'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] }, { 'kernel': ['linear'], 'C': [1, 10, 100, 1000] }] scores = ['precision', 'recall'] for score in scores: print("# Tuning hyper-parameters for %s" % score) print() clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=4, score_func=score, n_jobs=4, verbose=2) clf.fit(X_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_estimator_) print() print("Grid scores on development set:") print() for params, mean_score, scores in clf.cv_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) print() print("Detailed classification report:") print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) print()
def main(): print("Getting features for deleted papers from the database") if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] #code for including keywords match feature print "adding addtional features..." import additional_features as af all_features = af.get_additional_features() kw_deleted, kw_confirmed, _ = all_features kw_features = kw_deleted + kw_confirmed for i in range(len(features)): features[i] += tuple(kw_features[i][2:]) #Simple K-Fold cross validation. 10 folds. #cv = cross_validation.KFold(len(features), n_folds=5) cv = cross_validation.ShuffleSplit(len(features), n_iter=4, test_size=0.4, random_state=0) print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=0, compute_importances=True) featuresnp = np.array(features, dtype='int32') targetnp = np.array(target, dtype='int32') # with open("wrong_predictions.txt", 'w' ) as wp: # class1count = 0; class2count =0; rpredictions = 0 # for train, test in cv: # x_train = featuresnp[train]; y_train = targetnp[train] # x_test = featuresnp[test]; y_test = targetnp[test] # classifier.fit(x_train, y_train) # predictions = classifier.predict_proba(x_test) # pred_classes = classifier.predict(x_test) # for i in range(len(y_test)): # # if y_test[i] != pred_classes[i] : # if(predictions[i,0] > 0.5 and predictions[i,0] < 0.6): # class1count+=1; # if(predictions[i,1] > 0.5 and predictions[i,1] < 0.6): # class2count+=1; # line = "feat: "+str(features[test[i]])+" ".join([ " a:",str(y_test[i])," p:", str(pred_classes[i])," proba:", str(predictions[i]), "\n"]) # wp.write(line) # else: # if(predictions[i,0] > 0.4 and predictions[i,0] < 0.6): # rpredictions+=1; # # print "number of wrong predictions of deleted class: ", class1count # print "number of wrong predictions of confirmed class: ", class2count # print "number of right predictions with close probas", rpredictions # for train, test in cv: # print "total number of test examples: ", len(test) # classifier.fit(featuresnp, targetnp) # importances = classifier.feature_importances_ ## std = np.std([tree.feature_importances_ for tree in forest.estimators_], ## axis=0) # indices = np.argsort(importances)[::-1] # # # Print the feature ranking # print("Feature ranking:") # # for f in range(len(indices)): # print("%d. 
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) # # numFeatures = 15 # prunedFeatures = np.zeros(shape=(featuresnp.shape[0], numFeatures), dtype="int32") # for i in range(prunedFeatures.shape[0]): # for j, fi in enumerate(indices[0:numFeatures]): # prunedFeatures[i,j] = featuresnp[i, fi] # featuresnp -= np.mean(featuresnp, axis=0) # featuresnp /= np.std(featuresnp, axis=0) # results = cross_validation.cross_val_score(classifier, X=featuresnp, y=targetnp, cv=cv, n_jobs=4, verbose=True) #print out the mean of the cross-validated results print "Results: ", results print "Results: " + str(np.array(results).mean())
def main(): print("Getting features for deleted papers from the database") if(os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if(os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] #code for including keywords match feature print "adding addtional features..." import additional_features as af all_features = af.get_additional_features() kw_deleted, kw_confirmed, _ = all_features kw_features = kw_deleted+kw_confirmed for i in range(len(features)): features[i]+= tuple(kw_features[i][2:]) #Simple K-Fold cross validation. 10 folds. #cv = cross_validation.KFold(len(features), n_folds=5) cv = cross_validation.ShuffleSplit(len(features), n_iter=4, test_size=0.4, random_state=0) print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=0, compute_importances=True ) featuresnp = np.array(features, dtype='int32') targetnp = np.array(target, dtype='int32') # with open("wrong_predictions.txt", 'w' ) as wp: # class1count = 0; class2count =0; rpredictions = 0 # for train, test in cv: # x_train = featuresnp[train]; y_train = targetnp[train] # x_test = featuresnp[test]; y_test = targetnp[test] # classifier.fit(x_train, y_train) # predictions = classifier.predict_proba(x_test) # pred_classes = classifier.predict(x_test) # for i in range(len(y_test)): # # if y_test[i] != pred_classes[i] : # if(predictions[i,0] > 0.5 and predictions[i,0] < 0.6): # class1count+=1; # if(predictions[i,1] > 0.5 and predictions[i,1] < 0.6): # class2count+=1; # line = "feat: "+str(features[test[i]])+" ".join([ " a:",str(y_test[i])," p:", str(pred_classes[i])," proba:", str(predictions[i]), "\n"]) # wp.write(line) # else: # if(predictions[i,0] > 0.4 and predictions[i,0] < 0.6): # rpredictions+=1; # # print "number of wrong predictions of deleted class: ", class1count # print "number of wrong predictions of confirmed class: ", class2count # print "number of right predictions with close probas", rpredictions # for train, test in cv: # print "total number of test examples: ", len(test) # classifier.fit(featuresnp, targetnp) # importances = classifier.feature_importances_ ## std = np.std([tree.feature_importances_ for tree in forest.estimators_], ## axis=0) # indices = np.argsort(importances)[::-1] # # # Print the feature ranking # print("Feature ranking:") # # for f in range(len(indices)): # print("%d. 
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) # # numFeatures = 15 # prunedFeatures = np.zeros(shape=(featuresnp.shape[0], numFeatures), dtype="int32") # for i in range(prunedFeatures.shape[0]): # for j, fi in enumerate(indices[0:numFeatures]): # prunedFeatures[i,j] = featuresnp[i, fi] # featuresnp -= np.mean(featuresnp, axis=0) # featuresnp /= np.std(featuresnp, axis=0) # results = cross_validation.cross_val_score(classifier, X=featuresnp, y=targetnp, cv=cv, n_jobs=4, verbose=True) #print out the mean of the cross-validated results print "Results: ", results print "Results: " + str( np.array(results).mean())
def test_mlp(learning_rate=0.013, L1_reg=0.00, L2_reg=0.0003,
             n_epochs=300, n_hidden=200):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    """
    np.random.seed(0)

    print("Getting features for deleted papers from the database")
    if os.path.exists("features_deleted.obj"):
        with open("features_deleted.obj", 'rb') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'wb') as dumpfile:
            cPickle.dump(features_deleted, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if os.path.exists("features_confirmed.obj"):
        with open("features_confirmed.obj", 'rb') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'wb') as dumpfile:
            cPickle.dump(features_conf, dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features_valid = [x[2:] for x in data]

    # Cast the validation features to an int32 matrix for theano.
    predictInts = []
    for tup in features_valid:
        a, b, c, d, e = tup
        predictInts.append((int(a), int(b), int(c), int(d), int(e)))
    predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32')
    for i, tup in enumerate(predictInts):
        predictsMat[i, :] = tup

    predict_set_x = theano.shared(predictsMat, borrow=True)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for _ in range(len(features_deleted))] + \
             [1 for _ in range(len(features_conf))]

    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))
    featuresMat = np.ndarray(shape=(len(featuresInts), 5), dtype='int32')
    for i, tup in enumerate(featuresInts):
        featuresMat[i, :] = tup

    targetInts = np.ndarray(shape=len(target), dtype='int32')
    for i, e in enumerate(target):
        targetInts[i] = int(e)

    # Split into train/validation/test sets as theano shared variables.
    datasets = load_data(featuresMat, targetInts)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    batch_size = 20  # size of the minibatch
    # compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()
    x = T.matrix('x', dtype='int32')  # the data is presented as a dense matrix
    y = T.ivector('y')                # the labels are a 1D vector of [int] labels

    rng = np.random.RandomState(113)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=5, n_hidden=n_hidden, n_out=1)

    # the cost we minimize during training is the model's cost plus the
    # regularization terms (L1 and L2); cost is expressed here symbolically
    # cost = classifier.negative_log_likelihood(y) \
    #     + L1_reg * classifier.L1 \
    #     + L2_reg * classifier.L2_sqr
    cost = classifier.cost(y) \
        + L1_reg * classifier.L1 \
        + L2_reg * classifier.L2_sqr

    # compile theano functions that compute the mistakes the model makes
    # on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    predict_model = theano.function(
        inputs=[],
        outputs=classifier.predictions(),
        givens={x: predict_set_x})

    # compute the gradient of the cost with respect to theta (stored in
    # params); the resulting gradients are collected in gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # pair each parameter with its SGD update rule (an ordered mapping,
    # as theano expects)
    updates = OrderedDict()
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam

    # compile `train_model`, which returns the cost and, at the same time,
    # updates the model parameters based on the rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 1000000     # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.0995  # a relative improvement of this much is
                                    # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        training_cost = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            training_cost.append(minibatch_avg_cost)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_params = [classifier.params]
                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

        mean_cost = np.mean(training_cost)
        if mean_cost < 0.0005:
            done_looping = True
            print "training cost: ", mean_cost
            break
        print "Epoch ", epoch, " training cost: ", mean_cost

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")

    ############################
    # Making Predictions
    ############################
    print("Making predictions")
    predictions = list(predict_model())

    author_predictions = defaultdict(list)
    paper_predictions = {}
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="theano_")
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB
import os
import cPickle
import data_io
from sklearn import manifold

print("Getting features for deleted papers from the database")
if os.path.exists("features_deleted.obj"):
    with open("features_deleted.obj", 'rb') as loadfile:
        features_deleted = cPickle.load(loadfile)
else:
    features_deleted = data_io.get_features_db("TrainDeleted")
    with open("features_deleted.obj", 'wb') as dumpfile:
        cPickle.dump(features_deleted, dumpfile,
                     protocol=cPickle.HIGHEST_PROTOCOL)

print("Getting features for confirmed papers from the database")
if os.path.exists("features_confirmed.obj"):
    with open("features_confirmed.obj", 'rb') as loadfile:
        features_conf = cPickle.load(loadfile)
else:
    features_conf = data_io.get_features_db("TrainConfirmed")
    with open("features_confirmed.obj", 'wb') as dumpfile:
        cPickle.dump(features_conf, dumpfile,
                     protocol=cPickle.HIGHEST_PROTOCOL)

features = [x[2:] for x in features_deleted + features_conf]
target = [0 for _ in range(len(features_deleted))] + \
         [1 for _ in range(len(features_conf))]
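# This script stops after building features and target. Given the imports
# (RandomTreesEmbedding, BernoulliNB), a plausible continuation (a sketch,
# not the author's code) is the classic hashing-forest pipeline: embed
# each sample as sparse one-hot leaf indicators, then fit a Bernoulli
# naive Bayes on the embedding.
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
features_embedded = hasher.fit_transform(features)

nb = BernoulliNB()
nb.fit(features_embedded, target)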