def main():
    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

            i = i + _chunksize
    else:
        print("Reading the data from: " + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)

        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=100,
                                        subsample=1.0, min_samples_split=1, min_samples_leaf=1,
                                        max_depth=depth, init=None, random_state=None)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "Fitting for fold " + str(fold)
            rf.fit(X_train, y_train)

            y_test = vectorize_actual(y_test)
            _pred_probs = rf.predict_proba(X_test)

            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                          0.926642216133641, 0.009079270442274271]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)

            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1, n_estimators=100,
                                        subsample=1.0, min_samples_split=1, min_samples_leaf=1,
                                        max_depth=len(feature_names), init=None, random_state=None)

        rf.fit(fea, y)

        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)

        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names, data) mnbayes = MultinomialNB(alpha=1.0, fit_prior=True) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError y = [] for element in data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation #skf = StratifiedKFold(y,k = 10) skf = KFold(len(y), k=10) fold = 0 result_sum = 0 for train_index, test_index in skf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) mnbayes.fit( X_train, y_train ) #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) y_test = vectorize_actual(y_test) # vectorize y_test _pred_probs = mnbayes.predict_proba(X_test) # evaluating the performance result = eval.mcllfun(y_test, _pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold, result) print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names, test_data) print "Fitting" mnbayes.fit( fea, y ) #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) print "Making predictions" global probs probs = mnbayes.predict_proba(test_fea) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish - start)
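# The cross-validation loop above one-hot encodes the actual labels
# (vectorize_actual) and scores each fold with eval.mcllfun. A hedged sketch of
# what those two steps are assumed to compute: a one-hot matrix of the true
# classes and the mean multiclass log-loss against the predicted probabilities.
# Both *_sketch helpers below are illustrations, not the project's own
# vectorize_actual or eval module, which remain authoritative.
import numpy as np

def vectorize_actual_sketch(labels, n_classes=5):
    one_hot = np.zeros((len(labels), n_classes))
    one_hot[np.arange(len(labels)), labels] = 1.0   # mark the true class per row
    return one_hot

def mcll_sketch(actual_one_hot, pred_probs, epsilon=1e-15):
    pred = np.clip(np.asarray(pred_probs, dtype=float), epsilon, 1.0)
    pred = pred / pred.sum(axis=1, keepdims=True)   # make each row a proper distribution
    actual = np.asarray(actual_one_hot, dtype=float)
    return -np.sum(actual * np.log(pred)) / actual.shape[0]   # mean negative log-likelihood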
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names, data)

        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                   fit_intercept=True, intercept_scaling=1, tol=0.0001)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "fitting this fold's data"
            logit.fit(X_train, y_train)

            y_test = vectorize_actual(y_test)
            #_pred_probs = denormalize(logit.predict_proba(X_test))
            _pred_probs = logit.predict_proba(X_test)

            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945,
                          0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)

            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                   fit_intercept=True, intercept_scaling=1, tol=0.0001)  # not available: compute_importances=True

        print "Fitting"
        logit.fit(fea, y)

        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names, test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)

        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945,
                          0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
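# The fold loops above rebuild feature rows by hand, aliasing the 'BodyLength'
# feature to the 'BodyMarkdown' column and 'TitleLength' to 'Title'. A
# hypothetical helper that factors out that duplication; build_rows and
# _COLUMN_ALIASES are not part of the original code, only a sketch of one way
# the loops could be tidied while preserving the same row layout.
_COLUMN_ALIASES = {'BodyLength': 'BodyMarkdown', 'TitleLength': 'Title'}

def build_rows(fea, feature_names, indices):
    rows = []
    for i in indices:
        # look up each feature's backing column, falling back to the feature name itself
        rows.append([fea[_COLUMN_ALIASES.get(name, name)][i] for name in feature_names])
    return rows

# e.g. X_train = build_rows(fea, feature_names, train_index)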