def train_models(columns, rf=True, svm=False, logit=False):
    """Train one or more classifiers on the configured training columns.

    Parameters
    ----------
    columns : collection of column names (from COLUMNS) to use as features.
    rf : fit a random forest (default True).
    svm : fit an RBF-kernel SVM (default False).
    logit : fit a logistic regression (default False).

    Returns
    -------
    dict with keys:
      'models'   : {algorithm name: fitted sklearn model}
      'colnames' : feature column names (the 'TF' target excluded).
    """
    # Prepare the data: load every configured column that was requested,
    # always including the target column 'TF'.
    dataCols = dict(
        [(colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS
         if colName in columns or colName == 'TF']
    )
    data, colnames = create_matrix(dataCols)
    cimt = colnames.index('TF')
    # y = target column; x = every other column, in colnames order.
    y = data[:, cimt]
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]
    print >> sys.stderr, '\n>> Training model with cols:', colnames
    print >> sys.stderr, x
    # Fit the models
    models = {}
    if rf:
        rfModel = ensemble.RandomForestClassifier(
            n_estimators=100,
            ## criterion='entropy',
            verbose=1,
            n_jobs=-1,
            ## oob_score=True,
            ## min_samples_leaf=1,
        )
        rfModel.fit(x, y)
        models['rf'] = rfModel
    if svm:
        # BUG FIX: the boolean parameter 'svm' shadows the sklearn 'svm'
        # module, so the original 'svm.SVC(...)' raised AttributeError
        # whenever svm=True.  Import the class directly to dodge the
        # name collision without changing the public parameter name.
        from sklearn.svm import SVC
        svmModel = SVC(kernel='rbf')
        svmModel.fit(x, y)
        models['svm'] = svmModel
    if logit:
        logitModel = linear_model.LogisticRegression()
        logitModel.fit(x, y)
        models['logit'] = logitModel
    return {'models': models,
            'colnames': [c for c in colnames if c != 'TF']}
def cross_validation(logit=False):
    """Run an author-level train/validation split and report MAP per model.

    Authors are shuffled and split by the module-level fraction F; models
    are trained on the training authors and evaluated on the validation
    authors by mean average precision (MAP) over each author's ranked
    paper list.

    Parameters
    ----------
    logit : also train/evaluate a logistic regression model.

    Returns
    -------
    dict mapping algorithm name -> mean MAP over validation authors.
    """
    # Split authors in train/validation
    confirmed, deleted = get_train()
    aids = confirmed.keys()
    shuffle(aids)
    cutPoint = int(len(aids) * F)
    aidsTrain = aids[:cutPoint]
    aidsValid = aids[cutPoint:]
    # Prepare the training data
    dataCols = dict(
        [(colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS]
    )
    data, colnames = create_matrix(dataCols, exclude_aids=aidsValid)
    cimt = colnames.index('TF')
    ytrain = data[:, cimt]
    xtrain = data[:, [c for c in range(len(colnames)) if c != cimt]]
    # Validation data ('data' is deliberately rebound to the validation
    # matrix here; the train slices above were already extracted).
    data, colnames2, pairs = create_matrix(dataCols, exclude_aids=aidsTrain,
                                           return_pairs=True)
    if colnames2 != colnames:
        # BUG FIX: the original 'raise WTF' referenced an undefined name,
        # which would itself raise NameError instead of a useful error.
        raise ValueError(
            'column mismatch between train and validation matrices: '
            '%s != %s' % (colnames2, colnames))
    yvalid = data[:, cimt]
    xvalid = data[:, [c for c in range(len(colnames)) if c != cimt]]
    print >> sys.stderr, '==> %s train; %s valid <==' % (str(xtrain.shape),
                                                         str(xvalid.shape))
    # Train the model.
    # NOTE(review): this call does not match the train_models(columns, ...)
    # signature visible in this file, and iterating the result assumes a
    # plain {algorithm: model} dict rather than the {'models': ...,
    # 'colnames': ...} wrapper that version returns -- looks like a stale
    # call site from a refactor; confirm which train_models is intended.
    models = train_models(xtrain, ytrain, logit=logit)
    # Calculate performance of each algorithm
    performance = {}
    for algorithm in models:
        print >> sys.stderr, '\n\n--> %s <--\n' % algorithm
        print >> sys.stderr, models[algorithm]
        # make the predictions
        predBin = models[algorithm].predict(xvalid)
        pred = models[algorithm].predict_proba(xvalid)
        print >> sys.stderr, ''
        print >> sys.stderr, yvalid
        print >> sys.stderr, predBin
        print >> sys.stderr, pred
        # Extract author-paper scores.  pred[n, 0] is the probability of
        # class 0, so an ascending sort below ranks the most likely
        # "true" papers first.
        decorated = {}
        for n in range(len(pairs)):
            aid, pid = pairs[n]
            score = pred[n, 0]
            try:
                decorated[aid].append((score, pid, yvalid[n]))
            except KeyError:
                decorated[aid] = [(score, pid, yvalid[n])]
        # get the MAP
        MAPs = []
        for aid in decorated:
            sortedPapers = decorated[aid]
            sortedPapers.sort()
            ntrue, ntot, MAPTerms = 0, 0, []
            for s, p, tf in sortedPapers:
                ntot += 1
                if tf == 1:
                    ntrue += 1
                    MAPTerms.append(float(ntrue) / float(ntot))
            MAPs.append(np.mean(MAPTerms))
        # output results
        performance[algorithm] = np.mean(MAPs)
        print >> sys.stderr, '\n>> %s: %f' % (algorithm, np.mean(MAPs))
    # Done
    print >> sys.stderr, '\n'
    # BUG FIX: the original 'return' and its value 'performance' were on
    # separate statements, so the function returned None and the computed
    # dict was discarded.
    return performance
## ('nameinit', '../name.train.dat', 4), # ('nname', '../nname.train.dat', 3), ('npapers', '../npapers.train.dat', 3), ('nauthors', '../nauthors.train.dat', 3), # ('coauthors', '../coauthors_diff.train.dat', 3), ## ('zcoauthors', '../coauthors_diff.train.dat', 4), ('affiliation', '../affiliation.train.dat', 3), ('year', '../year.train.dat', 3), ('nvalidated', '../nvalidated.train.dat', 3), ('sumcoauthors', '../sumcoauthors.train.dat', 3), ) if __name__ == '__main__': dataCols = dict([(colName, read_data_col(fileName, valCol=valCol)) for colName, fileName, valCol in COLUMNS]) data, colnames = create_matrix(dataCols) print colnames cimt = colnames.index('TF') y = data[:, cimt] x = data[:, [c for c in range(len(colnames)) if c != cimt]] N = len(y) ntrain = int(N * F) ttindices = range(N) shuffle(ttindices) yTrain = y[ttindices[:ntrain], :] xTrain = x[ttindices[:ntrain], :] yTest = y[ttindices[ntrain+1:], :] xTest = x[ttindices[ntrain+1:], :]