def main():
    """Train an SVM on precomputed features and write a submission.

    Reads feature vectors from feature_file1, fits an SVC, predicts class
    probabilities for feature_file2, rescales them against updated priors
    and saves the result to submission_file.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)
    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    # BUG FIX: predict_proba() requires the SVC to be constructed with
    # probability=True; without it the prediction step raises at runtime.
    clf = svm.SVC(decision_function_shape='ovo', probability=True)
    # NOTE(review): labels truncated to the first 178351 rows, presumably to
    # match the precomputed feature file — confirm the row count.
    clf.fit(fea, data["OpenStatus"][:178351])
    print("Reading test file and making predictions")
    #features.compute_features("test_.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = clf.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train a random forest on mapped statuses and score the full train set."""
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=-1,
                                random_state=0)
    print("Training the model, created RFC")
    # fit against the remapped status column rather than the raw one
    rf.fit(fea, data["OpenStatusMod"])
    print("Reading test file and making predictions")
    # predictions are made on the full training file here, not the test file
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("adding column")
    test_features = features.extract_features(feature_names, data)
    print("extract features")
    probs = rf.predict_proba(test_features)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Random-forest pipeline: train, predict, adjust priors, submit, score."""
    print("Reading the data")
    train_frame = cu.get_dataframe(train_file)
    print("Extracting features")
    train_matrix = features.extract_features(feature_names, train_frame)
    print("Training the model")
    forest = RandomForestClassifier(n_estimators=50,
                                    verbose=2,
                                    compute_importances=True,
                                    n_jobs=-1)
    forest.fit(train_matrix, train_frame["OpenStatus"])
    print("Reading test file and making predictions")
    test_frame = cu.get_dataframe(test_file)
    test_matrix = features.extract_features(feature_names, test_frame)
    probs = forest.predict_proba(test_matrix)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    score()
def main():
    """Train an ExtraTrees classifier, time the run, and write a submission."""
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])
    # FIX: use print() consistently — the bare Python 2 print statements made
    # this function a syntax error under Python 3 (identical output in Py2).
    print("Listing feature importances:")
    cu.list_feature_importance(clf, feature_names)
    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)
    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def main():
    """Compute features, train a random forest, and write a submission."""
    print("Reading the data")
    labels = cu.get_dataframe(train_file)
    print("Extracting features")
    features.compute_features(train_file, feature_file1)
    print("Training the model")
    train_fea = cu.get_dataframe(feature_file1)
    forest = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # labels truncated to 140323 rows — presumably the number of rows in the
    # precomputed feature file; verify against feature_file1
    forest.fit(train_fea, labels["OpenStatus"][:140323])
    print("Reading test file and making predictions")
    features.compute_features(test_file, feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = forest.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train on the small sample files and write a small-sample submission."""
    print("Reading the data")
    data = cu.get_dataframe(train_little)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    # MultinomialNB, KNeighbors and GradientBoosting were tried here before
    # settling on a random forest
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        compute_importances=True,
                                        n_jobs=-1)
    classifier.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_litte)
    test_features = features.extract_features(feature_names, data)
    probs = classifier.predict_proba(test_features)
    print("Saving submission to %s" % submission_litte)
    cu.write_submission(submission_litte, probs)
def main():
    """Fit a MultiTaskElasticNet and write a submission.

    Status strings are mapped to their index in ques_status, the regressor's
    flat predictions are reshaped to [n_samples, 5] and (optionally) adjusted
    with updated class priors before writing.
    """
    start = time.time()
    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)
    # fit() needs numeric targets, so map each status string to its index
    # (the unused X accumulator from the original was removed)
    global y
    y = []
    print("Collecting statuses")
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print("Fitting")
    mten.fit(fea, y)
    # predict_proba needs sklearn >= 0.12: http://scikit-learn.org/0.11/install.html
    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print("Making predictions")
    global probs
    probs = mten.predict(test_fea)  # shape [n_samples]
    # FIX: floor division — under Python 3 `/` yields a float and np.resize
    # rejects non-integer shapes (result identical under Python 2)
    probs = np.resize(probs, (len(probs) // 5, 5))
    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def main():
    """Fit a Perceptron and write a submission.

    Status strings are mapped to their index in ques_status; since
    predict_proba is binary-only for Perceptron, raw predictions are reshaped
    to [n_samples, 5] instead.
    """
    start = time.time()
    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False,
                        n_iter=5, shuffle=False, verbose=1, eta0=1.0,
                        n_jobs=-1, seed=0, class_weight="auto",
                        warm_start=False)
    # fit() needs numeric targets, so map each status string to its index
    # (the unused X accumulator from the original was removed)
    global y
    y = []
    print("Collecting statuses")
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print("Fitting")
    percep.fit(fea, y)
    # predict_proba needs sklearn >= 0.12: http://scikit-learn.org/0.11/install.html
    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print("Making predictions")
    global probs
    # predict_proba is only available for binary classification on Perceptron
    probs = percep.predict(test_fea)  # shape [n_samples]
    # FIX: floor division — under Python 3 `/` yields a float and np.resize
    # rejects non-integer shapes (result identical under Python 2)
    probs = np.resize(probs, (len(probs) // 5, 5))
    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def main():
    """Fit a MultiTaskElasticNet and write a submission (duplicate pipeline).

    Identical flow to the other elastic-net runner: map statuses to indices,
    fit, predict, reshape to [n_samples, 5], optionally update priors.
    """
    start = time.time()
    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)
    # fit() needs numeric targets, so map each status string to its index
    # (the unused X accumulator from the original was removed)
    global y
    y = []
    print("Collecting statuses")
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print("Fitting")
    mten.fit(fea, y)
    # predict_proba needs sklearn >= 0.12: http://scikit-learn.org/0.11/install.html
    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print("Making predictions")
    global probs
    probs = mten.predict(test_fea)  # shape [n_samples]
    # FIX: floor division — under Python 3 `/` yields a float and np.resize
    # rejects non-integer shapes (result identical under Python 2)
    probs = np.resize(probs, (len(probs) // 5, 5))
    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def main():
    """Fit online-LDA topics, add them as features, train a forest, submit."""
    # documents per online-LDA mini-batch
    batchsize = 100
    # approximate total number of questions on Stack Overflow
    D = 3.3e6
    # number of LDA topics
    K = 20
    # expose the topic weights as classifier features
    feature_names.extend('Topic%d' % k for k in range(K))
    print("Reading the vocabulary")
    # FIX: the Python-2-only builtin file() is gone in Python 3 and the handle
    # was leaked; open() in a with-block is equivalent and closes the file.
    with open('./vocab4.txt') as vocab_file:
        vocab = [w.strip() for w in vocab_file]
    # vocabulary size
    W = len(vocab)
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)
    print("Extracting features")
    fea = extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Fit online-LDA topics, add them as features, train a forest, submit."""
    # documents per online-LDA mini-batch
    batchsize = 100
    # approximate total number of questions on Stack Overflow
    D = 3.3e6
    # number of LDA topics
    K = 20
    # expose the topic weights as classifier features
    feature_names.extend('Topic%d' % k for k in range(K))
    print("Reading the vocabulary")
    # FIX: the Python-2-only builtin file() is gone in Python 3 and the handle
    # was leaked; open() in a with-block is equivalent and closes the file.
    with open('./vocab4.txt') as vocab_file:
        vocab = [w.strip() for w in vocab_file]
    # vocabulary size
    W = len(vocab)
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)
    print("Extracting features")
    fea = extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    """Train a random forest on the full training set and write a submission.

    The parsed dataframe and extracted features are cached as pickles
    ('data.pik' / 'fea.pik') so repeat runs skip the expensive steps.
    """
    if os.path.exists('data.pik'):
        print("Unpickeling the data")
        # FIX: pickles are binary — open 'rb'/'wb' (the original text-mode
        # handles fail under Python 3 and corrupt data on Windows) and close
        # the leaked file handles via with-blocks.
        with open('data.pik', 'rb') as f:
            data = pickle.load(f)
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        with open('data.pik', 'wb') as f:
            pickle.dump(data, f)
    if os.path.exists('fea.pik'):
        print("Unpickeling the fea")
        with open('fea.pik', 'rb') as f:
            fea = pickle.load(f)
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        with open('fea.pik', 'wb') as f:
            pickle.dump(fea, f)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    rf.fit(fea, data["OpenStatus"])
    # FIX: print() consistently (the print statements were Py3 syntax errors)
    # and use sorted() — Python 3's zip object has no .sort().
    print("Features Importance:")
    imps = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
    print('\n'.join(str(imp) for imp in imps))
    print("Generalization Error: %s" % rf.oob_score_)
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def cross_validate(): print("Reading the data") data = cu.get_dataframe(train_file) print("Cross-Validating") rf = RandomForestClassifier(n_estimators=10, verbose=1, compute_importances=True, n_jobs=2) cv = cross_validation.KFold(len(data), k=10, indices=False) results = [] for traincv, testcv in cv: print "\t-- cv [%d]"%len(results) print "\t","extracting features" #... feacv = features.extract_features(feature_names, traincv) print "\t","learning" rf.fit(feacv, data["OpenStatus"]) print "\t","predicting" probs = rf.predict_proba(testcv) print "\t","evaluating" results.append( llfun(target[testcv], [x["OpenStatus"] for x in probas]) ) print "LogLoss: " + str( np.array(results).mean() )
# Writes a short sample of the feature matrix (X) and the enumerated
# OpenStatus labels (y) to csv via np.savetxt.
# NOTE(review): the trailing ''' opens a triple-quoted string that is not
# closed within this chunk — presumably closed later in the file; left
# untouched. Also Py2-only constructs here: the bare `print` statement and
# indexing dict .keys() / calling .index on it (both fail under Python 3).
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Writing short sample features file") ''' preview in console ''' print(fea.values[:4]) print fea.describe().to_string() ''' save the X features data (matrix)''' # cu.write_submission(train_features_short_file, fea.values) np.savetxt(train_features_short_file, fea.values, fmt='%d', delimiter=',', newline='\n') '''train_features_short = [fea, data["OpenStatus"]]''' closed_reasons = data["OpenStatus"] closed_reasons_count = Counter(closed_reasons) print(closed_reasons_count.keys()[0:5]) closed_reasons_enum = map(closed_reasons_count.keys().index, closed_reasons) print(closed_reasons_enum[:9]) print("Saving submission to %s" % submission_file) ''' save the y supervised classification data (vector) ''' np.savetxt(train_y_short_file, closed_reasons_enum, fmt='%d', delimiter=',', newline='\n') '''
def main():
    """Split train.csv into train-A/train-B around a fixed creation-date cutoff."""
    frame = cu.get_dataframe("train.csv")
    frame = frame.sort_index(by="PostCreationDate")
    header = cu.get_header("train.csv")
    # posts strictly before the cutoff go to part A, the rest to part B
    cutoff = datetime.datetime(2012, 7, 18)
    before_cutoff = frame["PostCreationDate"] < cutoff
    frame[before_cutoff].to_csv(os.path.join(cu.data_path, "train-A.csv"),
                                index=False)
    frame[~before_cutoff].to_csv(os.path.join(cu.data_path, "train-B.csv"),
                                 index=False)
def main():
    """Stratified 10-fold CV of the classifier chosen via --classifier.

    Prints per-fold log loss / F1 and their averages over the ten folds.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    X = features.extract_features(feature_names, data)
    y = [class_labels[i] for i in data["OpenStatus"]]
    skf = StratifiedKFold(y, 10)
    result_f1 = 0
    result_logloss = 0
    fold = 1
    for train, test in skf:
        print("Fold %d" % fold)
        fold += 1
        X_train = [X.ix[i] for i in train]
        y_train = [y[i] for i in train]
        X_test = [X.ix[i] for i in test]
        y_test = [y[i] for i in test]
        # pick the classifier requested on the command line
        choice = options.__dict__['classifier']
        if (choice == 'erf'):
            classifier = ExtraTreesClassifier(n_estimators=100, verbose=0,
                                              compute_importances=True,
                                              n_jobs=-1)
        elif (choice == 'mnb'):
            classifier = MultinomialNB()
        elif (choice == 'knn'):
            classifier = KNeighborsClassifier(n_neighbors=11)
        elif (choice == 'gbc'):
            classifier = GradientBoostingClassifier(n_estimators=200,
                                                    learn_rate=0.1)
        classifier.fit(X_train, y_train)
        probs = classifier.predict_proba(X_test)
        if (options.__dict__['priors'] != 0):
            print("Calculating priors and updating posteriors")
            new_priors = cu.get_priors(full_train_file)
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors,
                                             0.001)
        y_pred = probs
        y_test = np.array(y_test)
        logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15)
        y_pred = classifier.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=None)
        print("Log Loss: %f f1: %f" % (logloss, f1))
        result_f1 += f1
        result_logloss += logloss
    print('\navg LogLoss: %f avg f1: %f' % (result_logloss / 10.0,
                                            result_f1 / 10.0))
def main():
    """Scale features, fit a 10-NN classifier, and print the test log loss.

    Random forest, decision tree and AdaBoost variants existed as disabled
    alternatives; only nearest neighbours is active.
    """
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)
    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)
    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)
    print("Training nearest neighbors")
    # KNN is distance based, so standardise both sets with the train scaler
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled,
                                                    train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
def main():
    """Train an ExtraTrees classifier, time the run, and write a submission."""
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])
    # FIX: use print() consistently — the bare Python 2 print statements made
    # this function a syntax error under Python 3 (identical output in Py2).
    print("Listing feature importances:")
    cu.list_feature_importance(clf, feature_names)
    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)
    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def main():
    """Ensemble four tree models by averaging their predicted probabilities."""
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2,
                                compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1,
                                random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                              min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    # IMPROVED: average the four probability matrices directly instead of the
    # nested Python loop hard-coded to 5 classes — same values (elementwise
    # mean of the ndarrays), works for any class count
    probs = (rf.predict_proba(test_features)
             + gb.predict_proba(test_features)
             + dt.predict_proba(test_features)
             + et.predict_proba(test_features)) / 4.0
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def test1():
    """Smoke-test feature extraction on the full training file.

    Fills the NA values that extract_features cannot handle before calling it.
    """
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("fill na for bodymarkdown")
    # BUG FIX: fillna(..., inplace=True) returns None, and the original
    # assigned that back to `data`, so every call after the first crashed.
    # Mutate in place without reassigning.
    data.fillna({"BodyMarkdown": ""}, inplace=True)
    print("fill na for title")
    data.fillna({"Title": ""}, inplace=True)
    print("fill na for post creation")
    # FIX: 07 is an invalid (octal-style) literal under Python 3; plain ints
    # are identical in Python 2.
    data.fillna({"PostCreationDate": datetime.datetime(2008, 7, 31, 21, 42, 52)},
                inplace=True)
    print("fill na for owner creation")
    data.fillna({"OwnerCreationDate": datetime.datetime(2008, 7, 31, 21, 42, 53)},
                inplace=True)
    print("adding column")
    test_features = features.extract_features(feature_names, data)
def main():
    """Split the date-sorted training data 2/3 train, 1/3 test and save both."""
    print("get data")
    data = cu.get_dataframe("train.csv")
    print("sort by creation date")
    data = data.sort_index(by="PostCreationDate")
    print("cut off")
    header = cu.get_header("train.csv")
    # three equal chunks: first two become train, the last becomes test
    splits = np.array_split(data, 3)
    train_data = pd.concat([splits[0], splits[1]])
    test_data = splits[2]
    print("write to csv")
    cu.write_sample("train_data.csv", header, train_data)
    train_data.to_csv(os.path.join(cu.data_path, "train_data.csv"),
                      index=False, header=header)
    test_data.to_csv(os.path.join(cu.data_path, "test_data.csv"),
                     index=False, header=header)
def hack(): print("Reading the data") data = cu.get_dataframe(train_file) counter = defaultdict(lambda: defaultdict(lambda: 0)) grouped = data.groupby('OpenStatus') for name, group in grouped: print name for wrds in group["BodyMarkdown"].apply(words): for word in wrds: counter[name][word]+=1 limit = 20 for name, word_dict in counter.items(): for word, count in sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)[:limit] print name, word, count
def main():
    """Submit the training-set class priors as the prediction for every row."""
    priors = cu.get_priors("train.csv")
    row_count = len(cu.get_dataframe("public_leaderboard.csv"))
    # repeat the prior vector once per leaderboard row
    predictions = np.kron(np.ones((row_count, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
# NOTE(review): fragment — continuation of a feature_names list literal whose
# opening bracket lies outside this chunk; reproduced unchanged. The script
# tail extracts features for the listed names and enumerates OpenStatus
# labels via class_labels.
, "BodyHasCode" , "HourOfPost" , "WeekDayOfPost" , "DayOfPost" , "MonthOfPost" , "YearOfPost" , "MainTag" , "SecondTag" , "ThirdTag" , "FourthTag" , "FifthTag" , "TitlePmi" , "BodyPmi" , "TitleLenghtWords" , "TitleLength" , "BodyLenghtWords" , "BodyLength" ,"BagOfWords" ] # feature_names = ["BagOfWords"] data = cu.get_dataframe("../files/train-sample.csv") features = features.extract_features(feature_names, data) y = [ class_labels[i] for i in data["OpenStatus"]] print(features); # i = 0 # # for d in features: # print(d) # print(y[i]) # i = i + 1
# 10-fold stratified cross-validation of a 10-NN (distance-weighted)
# classifier over five hand-picked feature columns; prints per-fold and
# average MCLL scores.
# NOTE(review): the function continues onto the second chunk line below, and
# the trailing ''' opens a triple-quoted block (disabled public-leaderboard
# code) that is not closed within this chunk — left byte-identical, a rewrite
# is not safe from this view.
def main(): start = time.time() result_sum = 0 data = cu.get_dataframe("data/train-sample.csv") #test_data = cu.get_dataframe("data/public_leaderboard.csv") #use this for evaluating public_leaderboard print 'data loaded' fea = features.extract_features(feature_names, data) #test_fea = features.extract_features(feature_names,test_data) #use this for evaluating public_leaderboard print 'features extracted' knn = KNeighborsClassifier(n_neighbors=10, weights='distance') # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError y = [] ques_status = [ 'open', 'too localized', 'not constructive', 'off topic', 'not a real question' ] for element in data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation skf = StratifiedKFold(y, k=10) fold = 0 for train_index, test_index in skf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] temp.append(fea['ReputationAtPostCreation'][i]) temp.append(fea['UserAge'][i]) temp.append(fea['Title'][i]) temp.append(fea['BodyMarkdown'][i]) temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] temp.append(fea['ReputationAtPostCreation'][i]) temp.append(fea['UserAge'][i]) temp.append(fea['Title'][i]) temp.append(fea['BodyMarkdown'][i]) temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i]) X_test.append(temp) y_test.append(y[i]) y_test = vectorize_actual(y_test) # vectorize y_test knn.fit(X_train, y_train) # train the classifier predictions = knn.predict_proba(X_test) # predict the test fold # evaluating the performance result = eval_tool.mcllfun(y_test, predictions) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold, result) print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10) 
finish = time.time() print "completed in %0.4f seconds" % (finish - start) ### Use this code for evaluting public_leaderboard '''knn.fit(fea,y)
def main():
    """Submit a uniform 0.2 probability for each of the five classes."""
    row_count = len(cu.get_dataframe("public_leaderboard.csv"))
    # one row of five 0.2 entries per leaderboard sample
    predictions = np.kron(np.ones((row_count, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
def user_age(data):
    """Age of the poster's account at post time, in seconds, as a DataFrame."""
    deltas = data["PostCreationDate"] - data["OwnerCreationDate"]
    return pd.DataFrame.from_dict(
        {"UserAge": deltas.apply(lambda delta: delta.total_seconds())})

###########################################################

def extract_features(feature_names, data):
    """Build one column per feature name.

    A name already present in `data` is taken as-is; otherwise the matching
    snake_case function in the `features` module computes it.
    """
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea

if __name__ == "__main__":
    feature_names = [
        "BodyLength",
        "NumTags",
        "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation",
        "TitleLength",
        "TitleWordCount",
        "UserAge",
    ]
    data = cu.get_dataframe("train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
def main():
    """Submit the training-set class priors as the prediction for every row."""
    priors = cu.get_priors("train.csv")
    row_count = len(cu.get_dataframe("test_.csv"))
    # repeat the prior vector once per test row
    predictions = np.kron(np.ones((row_count, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
def extract_features(feature_names, data):
    """Build one column per feature name.

    A name already present in `data` is taken directly (absolute-valued via
    math.fabs); otherwise the matching snake_case function in the `features`
    module computes it.
    """
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            # absolute value applied to raw columns from the massaged csv
            fea = fea.join(data[name].apply(math.fabs))
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea

if __name__ == "__main__":
    feature_names = [
        "BodyLength",
        "NumTags",
        "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation",
        "TitleLength",
        "UserAge",
    ]
    data = cu.get_dataframe("private_leaderboard_massaged.csv")
    features = extract_features(feature_names, data)
    print(features)
    # FIX: print() instead of the Python-2-only statement form, which made the
    # whole file a syntax error under Python 3 (identical output in Py2).
    print(features['NumTags'])
def main():
    """Predict 'open' with certainty for every row (always-open benchmark)."""
    row_count = len(cu.get_dataframe("public_leaderboard.csv"))
    # probability mass entirely on the fourth class ('open') for each row
    predictions = [[0.0, 0.0, 0.0, 1.0, 0.0] for _ in range(row_count)]
    cu.write_submission("always_open_benchmark.csv", predictions)
def main():
    """Random forest on precomputed features; prints an open/closed confusion
    matrix before writing the submission."""
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)
    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1)
    # NOTE(review): labels truncated to 178351 rows, presumably to match the
    # precomputed feature file — confirm the count.
    rf.fit(fea, data["OpenStatus"][:178351])
    # collect features whose importance is above the average importance
    important_features = []
    for x, i in enumerate(rf.feature_importances_):
        if i > np.average(rf.feature_importances_):
            important_features.append([str(x), i])
    # FIX: print() consistently — the Python 2 print statements made this a
    # syntax error under Python 3 (output identical in Py2).
    print('Most important features: %s' % (important_features,))
    print("Reading test file and making predictions")
    #features.compute_features("test1.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.01)
    # collapse probabilities to open/closed: index 3 is the 'open' class
    y_pred = []
    for i in probs:
        i = [float(k) for k in i]
        j = i.index(max(i))
        if (j == 3):
            y_pred.append("open")
        else:
            print("hi")
            y_pred.append("closed")
    y_true = []
    a = 0
    b = 0
    test_reader = csv.reader(open(test_file))
    # FIX: next(reader) — the Py2-only .next() method is gone in Python 3
    headers = next(test_reader)
    for line in test_reader:
        if line[14] == 'open':
            y_true.append("open")
            a = a + 1
        else:
            y_true.append("closed")
            b = b + 1
    print(a)
    print(b)
    # NOTE(review): y_true[1:] drops one more row even though the header was
    # already consumed above — confirm this offset is intended.
    print(confusion_matrix(y_true[1:], y_pred, labels=["open", "closed"]))
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
# NOTE(review): fragment — the first statements belong to a row_to_features
# function whose definition starts outside this chunk, and the trailing
# TF-IDF loop continues beyond it; reproduced unchanged. get_preprocess_data
# maps every dataframe row through row_to_features and rebuilds a DataFrame;
# the __main__ tail loads train/test data, blanks NaNs, preprocesses both and
# begins a per-text-column TfidfVectorizer pass.
features['tags'] = ' '.join(features) return features def get_preprocess_data(data): data_p = [] for i in range(0, len(data)): data_p.append(row_to_features(data.ix[i])) data_p = pd.DataFrame(data_p) return data_p if __name__ == "__main__": print("get train data") train_data = cu.get_dataframe(train_file) print("get test data") test_data = cu.get_dataframe(test_file) train_data = train_data.replace(np.nan, '', regex=True) test_data = test_data.replace(np.nan, '', regex=True) train_data_p = get_preprocess_data(train_data) test_data_p = get_preprocess_data(test_data) text_vars = ['title', 'tags', 'body'] print("print tf idf") for var in text_vars: tf_idf = TfidfVectorizer(min_df=2, use_idf=1, smooth_idf=1, sublinear_tf=1, ngram_range=(1, 2), norm='l2')
# NOTE(review): fragment — the leading else-branch belongs to an
# extract_features loop whose beginning is outside this chunk; reproduced
# unchanged. get_words splits a post body into lowercase dictionary words,
# punctuation-ish tokens and stopwords (relies on module-level `re`, `Body`
# and `stopwords`).
else: fea = fea.join(getattr(features, camel_to_underscores(name))(data)) #re_qm = re.compile('\?') #fea['HasBodyQM'] = data.BodyMarkdown.apply(lambda b: re_qm.search(b) != None) #fea['IsNoob'] = data.ReputationAtPostCreation <= 1 #fea['IsLeech'] = data.OwnerUndeletedAnswerCountAtPostTime == 0 #print 'Added HasBodyQM: ', Counter(fea['HasBodyQM']) #print 'Added IsNoob: ', Counter(fea['IsNoob']) #print 'Added IsLeech: ', Counter(fea['IsLeech']) #print "generating extra features" #fea = fea.join(get_extra_features(data)) return fea if __name__ == "__main__": data = cu.get_dataframe("C:\\Projects\\ML\\stack\\data\\train-sample.csv") features = extract_features(data) print(features) def get_words(data, i): p = re.compile('^[a-z][a-z-]*[a-z]$') body = Body(data.BodyMarkdown[i]) words = body.get_unique_words() punks = [w for w in words if not p.match(w)] stops = [w for w in words if w in stopwords] words = [w for w in words if not w in stopwords and p.match(w)] return words, punks, stops
def main():
    # Train a gradient-boosting classifier on question features; either run
    # 10-fold cross-validation (MCLL metric) or emit a submission file.
    if (use_low_mem == 1):
        # Stream the training file in chunks to bound memory usage.
        data_iter = cu.iter_data_frames(train_file, _chunksize)
        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            # Encode OpenStatus strings as their index in ques_status.
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)
            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
    if do_cross_validation == 1:
        # Tree depth tied to the number of features — TODO confirm heuristic.
        depth = len(feature_names)
        print "depth=" + str(depth)
        # learn_rate: parameter name from old scikit-learn versions.
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=depth,
                                        init=None,
                                        random_state=None)
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    # BodyLength/TitleLength live under the raw column names
                    # in the feature frame.
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            print "Fitting for fold " + str(fold)
            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None,
                                        random_state=None)
        rf.fit(fea, y)
        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)
        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
        # Tail of extract_features (its "def" line is outside this chunk):
        # raw dataframe columns pass through; other names resolve to
        # generator functions on the features module.
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__=="__main__":
    # Earlier feature set kept for reference:
    # feature_names = [ "BodyLength"
    #                 , "NumTags"
    #                 , "OwnerUndeletedAnswerCountAtPostTime"
    #                 , "ReputationAtPostCreation"
    #                 , "TitleLength"
    #                 , "UserAge"
    #                 ]
    feature_names = [ "BodyLength"
                    , "OwnerUndeletedAnswerCountAtPostTime"
                    , "ReputationAtPostCreation"
                    , "UserAge"
                    , "Title"
                    , "Tag1"
                    , "Tag2"
                    , "Tag3"
                    , "Tag4"
                    , "Tag5"
                    ]
    data = cu.get_dataframe("C:\\Users\\Ben\\Temp\\StackOverflow\\train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
    # Tail of row_to_features (its "def" line is outside this chunk).
    features['title'] = title
    features['label'] = row['OpenStatus']
    # NOTE(review): ' '.join(features) joins the dict's KEYS, not the tag
    # values — confirm this is intended.
    features['tags'] = ' '.join(features)
    return features


def get_preprocess_data(data):
    """Apply row_to_features to every row of *data* and return the
    resulting feature dicts as a new DataFrame (one row per dict)."""
    data_p = []
    for i in range(0, len(data)):
        data_p.append(row_to_features(data.ix[i]))  # .ix: legacy pandas indexer
    data_p = pd.DataFrame(data_p)
    return data_p


if __name__=="__main__":
    print("get train data")
    train_data = cu.get_dataframe(train_file)
    print("get test data")
    test_data = cu.get_dataframe(test_file)
    # NaN -> '' so downstream text processing always sees strings.
    train_data = train_data.replace(np.nan,'', regex=True)
    test_data = test_data.replace(np.nan,'', regex=True)
    train_data_p = get_preprocess_data(train_data)
    test_data_p = get_preprocess_data(test_data)
    text_vars = ['title', 'tags', 'body']
    print("print tf idf")
    for var in text_vars:
        # Fit word+bigram TF-IDF on the combined train+test vocabulary,
        # then store per-class probabilities as 5 extra prediction columns.
        tf_idf = TfidfVectorizer(min_df=2, use_idf=1, smooth_idf=1, sublinear_tf=1, ngram_range=(1,2), norm='l2')
        tf_idf.fit(train_data_p[var].append(test_data_p[var]))
        probs = get_predictions(tf_idf, train_data_p[var], test_data_p[var], train_data_p['label'])
        for i in range(1,6):
            test_data_p[var+'_pred%d'%i] = probs[:, i-1]
fea = fea.join(getattr(features, camel_to_underscores(name))(data)) #re_qm = re.compile('\?') #fea['HasBodyQM'] = data.BodyMarkdown.apply(lambda b: re_qm.search(b) != None) #fea['IsNoob'] = data.ReputationAtPostCreation <= 1 #fea['IsLeech'] = data.OwnerUndeletedAnswerCountAtPostTime == 0 #print 'Added HasBodyQM: ', Counter(fea['HasBodyQM']) #print 'Added IsNoob: ', Counter(fea['IsNoob']) #print 'Added IsLeech: ', Counter(fea['IsLeech']) #print "generating extra features" #fea = fea.join(get_extra_features(data)) return fea if __name__=="__main__": data = cu.get_dataframe("C:\\Projects\\ML\\stack\\data\\train-sample.csv") features = extract_features(data) print(features) def get_words(data, i): p = re.compile('^[a-z][a-z-]*[a-z]$') body = Body(data.BodyMarkdown[i]) words = body.get_unique_words() punks = [w for w in words if not p.match(w)] stops = [w for w in words if w in stopwords] words = [w for w in words if not w in stopwords and p.match(w)] return words, punks, stops
    # Tail of title_length (its "def" line is outside this chunk):
    # character count of each title, as a TitleLength column.
    return pd.DataFrame.from_dict({"TitleLength": data["Title"].apply(len)})


def user_age(data):
    """UserAge feature: seconds between account creation and post creation."""
    return pd.DataFrame.from_dict({"UserAge": (data["PostCreationDate"] - data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())})


###########################################################


def extract_features(feature_names, data):
    """Assemble the requested feature columns: raw dataframe columns pass
    through unchanged; other names resolve to generator functions on the
    features module (CamelCase -> snake_case)."""
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__=="__main__":
    feature_names = [ "BodyLength"
                    , "NumTags"
                    , "OwnerUndeletedAnswerCountAtPostTime"
                    , "ReputationAtPostCreation"
                    , "TitleLength"
                    , "UserAge"
                    ]
    data = cu.get_dataframe("train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
def main():
    # Train a multinomial naive-Bayes model on question features; optionally
    # run 10-fold cross-validation (MCLL metric), then write a submission.
    start = time.time()
    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names,data)
    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
       predict_proba method; http://scikit-learn.org/0.11/install.html '''
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []
    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    if do_cross_validation == 1:
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y,k = 10)
        skf = KFold(len(y),k = 10)
        fold = 0
        result_sum = 0
        for train_index,test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            # Materialise train/test matrices row by row from the feature frame.
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            mnbayes.fit(X_train, y_train)  #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
            y_test = vectorize_actual(y_test)  # vectorize y_test
            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test,_pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold,result)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)
    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names,test_data)
    print "Fitting"
    mnbayes.fit(fea,y)#, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
    print "Making predictions"
    global probs
    probs = mnbayes.predict_proba(test_fea)
    # Prior adjustment kept for reference:
    #if is_full_train_set == 0:
    #    print("Calculating priors and updating posteriors")
    #    new_priors = cu.get_priors(full_train_file)
    #    old_priors = cu.get_priors(train_file)
    #    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)
def main():
    # Train a multinomial naive-Bayes model on question features; optionally
    # run 10-fold cross-validation (MCLL metric), then write a submission.
    # (Reformatted duplicate of the variant above.)
    start = time.time()
    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
       predict_proba method; http://scikit-learn.org/0.11/install.html '''
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []
    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    if do_cross_validation == 1:
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y,k = 10)
        skf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            # Materialise train/test matrices row by row from the feature frame.
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            mnbayes.fit(
                X_train, y_train
            )  #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
            y_test = vectorize_actual(y_test)  # vectorize y_test
            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print "Fitting"
    mnbayes.fit(
        fea, y
    )  #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
    print "Making predictions"
    global probs
    probs = mnbayes.predict_proba(test_fea)
    # Prior adjustment kept for reference:
    #if is_full_train_set == 0:
    #    print("Calculating priors and updating posteriors")
    #    new_priors = cu.get_priors(full_train_file)
    #    old_priors = cu.get_priors(train_file)
    #    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def extract_features(feature_names, data): fea = pd.DataFrame(index=data.index) for name in feature_names: if name in data: #fea = fea.join(data[name]) fea = fea.join(data[name].apply(math.fabs)) else: #try: fea = fea.join(getattr(features, camel_to_underscores(name))(data)) #except TypeError: # pass return fea if __name__=="__main__": feature_names = [ "BodyLength" , "NumTags" , "OwnerUndeletedAnswerCountAtPostTime" , "ReputationAtPostCreation" , "TitleLength" , "UserAge" ] data = cu.get_dataframe("private_leaderboard_massaged.csv") #cu.get_dataframe("train-sample_October_9_2012_v2_massaged.csv") features = extract_features(feature_names, data) print(features) #print features['UserAge'] #print features['BodyLength'] #print features['TitleLength'] print features['NumTags']
def user_age(data):
    """UserAge feature: seconds between the owner's account creation and
    the post's creation, as a one-column DataFrame."""
    age = data["PostCreationDate"] - data["OwnerCreationDate"]
    seconds = age.apply(lambda delta: delta.total_seconds())
    return pd.DataFrame.from_dict({"UserAge": seconds})


###########################################################


def extract_features(feature_names, data):
    """Assemble the requested feature columns: raw dataframe columns pass
    through unchanged; any other name resolves to a generator function on
    the features module (CamelCase -> snake_case)."""
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            column = data[name]
        else:
            column = getattr(features, camel_to_underscores(name))(data)
        fea = fea.join(column)
    return fea


if __name__ == "__main__":
    # Smoke-run feature extraction on a local training sample.
    feature_names = [
        "BodyLength",
        "NumTags",
        "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation",
        "TitleLength",
        "UserAge",
    ]
    data = cu.get_dataframe(
        "C:\\Users\\Ben\\Temp\\StackOverflow\\train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
        # Tail of user_age (its "def" line is outside this chunk): seconds
        # between account creation and post creation, as a UserAge column.
        {"UserAge": (data["PostCreationDate"] - data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())}
    )


###########################################################


def extract_features(feature_names, data):
    """Assemble the requested feature columns: raw dataframe columns pass
    through unchanged; other names resolve to generator functions on the
    features module (CamelCase -> snake_case)."""
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__ == "__main__":
    feature_names = [
        "BodyLength",
        "NumTags",
        "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation",
        "TitleLength",
        "UserAge",
    ]
    data = cu.get_dataframe("/home/kesten/VCP/Git/ML/StackOverflowChallenge/data/train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
def main():
    # Train a Perceptron on question features and write hard predictions
    # (reshaped into a 5-column matrix) as the submission.
    start = time.time()
    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False,
                        n_iter=5, shuffle=False, verbose=1, eta0=1.0,
                        n_jobs=-1, seed=0, class_weight="auto",
                        warm_start=False)
    # NOTE(review): X is built but never used — fit() below uses fea.
    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []
    print "Collecting statuses"
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print "Fitting"
    percep.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
       predict_proba method; http://scikit-learn.org/0.11/install.html '''
    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print "Making predictions"
    global probs
    #probs = percep.predict_proba(test_fea)  # only available for binary classification
    probs = percep.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    # NOTE(review): np.resize repeats/truncates the flat prediction array to
    # fill a (n/5, 5) matrix — this is not a per-class probability; confirm
    # the submission format tolerates it.
    probs = np.resize(probs, (len(probs) / 5, 5))
    # Prior adjustment kept for reference:
    #if is_full_train_set == 0:
    #    print("Calculating priors and updating posteriors")
    #    new_priors = cu.get_priors(full_train_file)
    #    old_priors = cu.get_priors(train_file)
    #    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main():
    """Baseline submission: predict "open" (probability 1, class index 3)
    for every row of the public leaderboard file."""
    sample_count = len(cu.get_dataframe("public_leaderboard.csv"))
    always_open = [[0.0, 0.0, 0.0, 1.0, 0.0] for _ in range(sample_count)]
    cu.write_submission("always_open_benchmark.csv", always_open)
def main(): start = time.time() if (use_low_mem == 1): data_iter = cu.iter_data_frames(train_file, _chunksize) i = _chunksize fea = None y = [] for train_data in data_iter: print "About to have processed: " + str(i) print("Extracting features") if fea is None: fea = features.extract_features(feature_names, train_data) else: fea = fea.append(features.extract_features(feature_names, train_data)) for element in train_data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) i = i + _chunksize else: print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) fea = features.extract_features(feature_names,data) print "Collecting statuses" y = [] for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation kf = KFold(len(y),k = 10) fold = 0 result_sum = 0 for train_index,test_index in kf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) print "fitting this fold's data" rf.fit(X_train, y_train) y_test = vectorize_actual(y_test) #_pred_probs = denormalize(rf.predict_proba(X_test)) _pred_probs = 
rf.predict_proba(X_test) print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001) # evaluating the performance result = eval.mcllfun(y_test,_pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold,result) print "Average MCLL score for this classifier = %0.11f" % (result_sum/10) else: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True print "Fitting" logit.fit(fea, y) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs probs = logit.predict_proba(test_fea) if is_full_train_set == 0: print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)