def training_and_classification_with_kfold_cross_validation(collection_name, k):
    '''
    Training and classification of an autotagger using k-fold cross validation
    '''
    _split_metadata_and_features(collection_name, k)
    for i in range(1, k+1):
        # Create a gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d)-----------------------" % i
        training_features = 'train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size = 5000
        dataset_suffix = "fold%d" % i
        replace_dataset = True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)

        # Feature selection over the gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % i
        dataset = 'dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance = 75
        include_highlevel = True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)

        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d)-----------------------" % i
        dataset = 'transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata = 'train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features = 'test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary = 'test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity = 'test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric = 'LC'
        num_sim = 18
        threshold = 0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
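# A minimal usage sketch for the routine above; "my_collection" and the fold
# count are illustrative, and the train/ and test/ TSV layout is assumed to
# have been produced by _split_metadata_and_features.
training_and_classification_with_kfold_cross_validation("my_collection", 5)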
def __init__(self):
    self.dataSets = os.listdir('data/')
    self.candidates = {}
    self._loadDataSet()
    featureSelector = FeatureSelector(self.candidates)
    #self.featuresCat = []
    #self.featuresBi = []
    self.events = []
    # detect events
    for h in self.candidates:
        for t in self.candidates[h]:
            candidate = self.candidates[h][t]
            featuresCat = featureSelector.getFeatures(candidate, ['wordFeatures'])
            # register the category classifier so 'category' features can be derived
            # (self.classifierCat and self.classifierBi are assumed to be loaded elsewhere)
            featureSelector.addCategoryClassifier(self.classifierCat)
            label = self.classifierCat.classify(featuresCat)
            featuresBi = featureSelector.getFeatures(candidate, ['category', 'location', 'wordOverlapSimple', 'wordOverlapUser'])
            classifierBiLabel = self.classifierBi.classify(featuresBi)
            if classifierBiLabel != "geen_event":
                self.events.append((candidate, classifierBiLabel))
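# The __init__ above reads self.classifierCat and self.classifierBi without
# defining them. A hedged sketch of a helper that could restore them from the
# pickles written by ClassifierCreator._saveClassifiers (shown later in this
# collection); the helper name and path handling are assumptions:
import pickle

def _loadClassifiers(self, datasetDir):
    with open(datasetDir + "/categoryClassifier.bin", "rb") as f:
        self.classifierCat = pickle.load(f)  # pickled there as classifierA
    with open(datasetDir + "/eventClassifier.bin", "rb") as f:
        self.classifierBi = pickle.load(f)   # pickled there as classifierB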
Y_train = pd.factorize(train[labelName])[0]
X_train_origin = train.iloc[:, 0:train.columns.size - 1].copy()
Y_test = pd.factorize(test[labelName])[0]
X_test_origin = test.iloc[:, 0:test.columns.size - 1].copy()

scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X_train_origin)  # scaling fit on training data only
X_train_origin = pd.DataFrame(scaler.transform(X_train_origin.copy()), columns=X_train_origin.columns)
# apply same transformation to test data
X_test_origin = pd.DataFrame(scaler.transform(X_test_origin.copy()), columns=X_test_origin.columns)

trainTmp = X_train_origin.copy()
trainTmp[labelName] = Y_train
fs = FeatureSelector(trainTmp)
featureSize = data.columns.size
threshold = 10
clfNames = ["lbfgs", "adam", "sgd", "randomForest", "decisionTree", "rbf", "poly", "linear", "knn"]
while featureSize >= threshold:
    features = fs.featureSelectionSelectKBestClassification(featureSize, labelName)
    print(features)
    clfs = [MLPClassifier(solver='lbfgs', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
            MLPClassifier(solver='adam', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
            MLPClassifier(solver='sgd', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
            RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42),
            tree.DecisionTreeClassifier(),
            svm.SVC(kernel='rbf', C=10.0, gamma=0.1, probability=True),
            svm.SVC(kernel='poly', C=10.0, degree=3, probability=True),
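# The fragment above breaks off inside the clfs list. A minimal sketch of what
# the rest of such a loop iteration could look like (purely illustrative: it
# assumes the list is eventually closed, that the selected features are column
# names, and that each model is scored on that subset before it is shrunk):
def evaluate_round(clfs, clfNames, features, X_tr, y_tr, X_te, y_te):
    for name, clf in zip(clfNames, clfs):
        clf.fit(X_tr[features], y_tr)
        print(name, clf.score(X_te[features], y_te))

# evaluate_round(clfs, clfNames, features, X_train_origin, Y_train, X_test_origin, Y_test)
# featureSize -= 1  # shrink the candidate feature set for the next round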
def replace_dummies(df):
    # header reconstructed from the calls below; categorised_attributes and
    # ignored_attributes are assumed to be module-level lists defined earlier
    for categorised in categorised_attributes:
        if categorised in df:
            df[categorised] = df[categorised].astype('category').cat.codes
    df.fillna(0, inplace=True)
    df = df.drop(columns=[x for x in ignored_attributes if x in df])
    return df

print("Loading training data...")
train_data_frame = pandas.read_csv('kaggle_data/train.csv')
train_data_frame = replace_dummies(train_data_frame)

print("Selecting top features for use...")
selector = FeatureSelector(train_data_frame, 'SalePrice')
top_features = selector.rank_features(70)
top_named_features = [train_data_frame.columns[x] for x in top_features]
top_named_features.append('SalePrice')
print("Top features: ")
print(top_named_features)

print("Reloading train data with only top features")
train_data_frame = pandas.read_csv('kaggle_data/train.csv', usecols=top_named_features)
train_data_frame = replace_dummies(train_data_frame)
train_data_frame = train_data_frame.drop(axis=1, columns=[ x
# preparing test and training for final evaluation: using copies not to create problems
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# don't cheat: fit only on training data
scaler.fit(train)
trainTmp = pd.DataFrame(scaler.transform(train.copy()), columns=train.columns)
# apply same transformation to test data
testTmp = pd.DataFrame(scaler.transform(test.copy()), columns=test.columns)

fsSize = train.columns.size
threshold = 10
fs = FeatureSelector(trainTmp.copy())
clfNames = ["lbfgs", "adam", "sgd", "randomForest", "decisionTree", "linear", "poly", "rbf", "Knn"]
while fsSize >= threshold:
    features = fs.featureSelectionSelectKBestRegression(fsSize, labelName)
    print("FEATURES IN THE WHILE LOOP ", features)
    #C=1e3
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_lin = SVR(kernel='linear', C=1)
    svr_poly = SVR(kernel='poly', C=1)
    clfs = [MLPRegressor(solver='lbfgs', alpha=10.0, hidden_layer_sizes=(10,), activation="tanh", epsilon=1e-4),
            MLPRegressor(solver='adam', alpha=10.0, hidden_layer_sizes=(10,), activation="tanh", epsilon=1e-4),
            MLPRegressor(solver='sgd', alpha=10.0, hidden_layer_sizes=(10,), activation="tanh", epsilon=1e-4),
            RandomForestRegressor(n_jobs=10, random_state=45, n_estimators=10),
            DecisionTreeRegressor(),
            svr_lin, svr_poly, svr_rbf,
import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Local modules assumed by this snippet (exact import paths are a guess):
# from FeatureSelector import FeatureSelector
# from plotting import plot_confussion_matrix


class ClassifierMethods(object):
    def __init__(self, input_, labels, train_percent, test_percent):
        self.input_ = input_
        self.labels = labels
        self.define_indices(train_percent, test_percent)
        self.featSel = FeatureSelector()
        self.logreg_clf = LogisticRegression()

    def define_indices(self, train_percent, test_percent):
        n_rows = 4500  # input_[i].shape[0]
        train_end = int(n_rows * train_percent)
        test_end = int(train_end + n_rows * test_percent)
        self.train_indices = (0, train_end)
        self.test_indices = (train_end, test_end)
        self.validation_indices = (test_end, -1)

    def plot_roc_curve(self):
        pass

    def log_reg(self, X_train, y_train, X_test):
        self.logreg_clf.fit(X_train, y_train)
        # fixed: the original referenced the non-existent attributes
        # self.log_regclf and self.clf instead of self.logreg_clf
        return self.logreg_clf.predict_proba(X_test), self.logreg_clf.predict(X_test)

    def rank_features(self):
        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )
            print('--------------------------------------------')
            print('Select k-best')
            print(self.featSel.kBest_score(X_train, y_train))
            print('Select extratrees')
            print(self.featSel.extree_score(X_train, y_train))
            print('----------------------------------------------')

    def class_rdnforest(self, X_train_score, X_input_score, max_depth,
                        min_samples_split, random_state, X_train, X_input,
                        y_train, y_input, time_ms,
                        figname='default_rdnfor.png', last=False):
        rdm_for = RandomForestClassifier(max_depth=max_depth,
                                         random_state=random_state,
                                         min_samples_split=min_samples_split)
        start = time.process_time()
        rdm_for.fit(X_train, y_train)
        end = time.process_time()
        time_ms = (end - start) + time_ms
        if not last:
            X_train_score = X_train_score + np.array(rdm_for.predict_proba(X_train))
            X_input_score = X_input_score + np.array(rdm_for.predict_proba(X_input))
            return X_train_score, X_input_score, time_ms
        else:
            y_pred = np.array(rdm_for.predict(X_input))
            y_input.reshape(900, )
            classes = np.unique(y_input)
            plot_confussion_matrix(y_input, y_pred, classes,
                                   plot_name=figname, cmap=plt.cm.Blues,
                                   show=False)
            return time_ms

    def class_ada(self, X_train, y_train, X_input, y_input, X_train_score,
                  X_input_score, time_ms, figname='default_ada.png',
                  last=False):
        ada_clf = AdaBoostClassifier(random_state=2, learning_rate=0.1)
        start = time.process_time()
        ada_clf.fit(X_train, y_train)
        end = time.process_time()
        time_ms = (end - start) + time_ms
        if not last:
            X_train_score = X_train_score + np.array(ada_clf.predict_proba(X_train))
            X_input_score = X_input_score + np.array(ada_clf.predict_proba(X_input))
            return X_train_score, X_input_score, time_ms
        else:
            y_pred = np.array(ada_clf.predict(X_input))
            y_input.reshape(900, )
            classes = np.unique(y_input)
            plot_confussion_matrix(y_input, y_pred, classes,
                                   plot_name=figname, cmap=plt.cm.Blues,
                                   show=False)
            return time_ms

    def train_and_class(self, test=False):
        rdn_for = np.zeros((900, 9))
        ada_score = np.zeros((900, 9))
        rdn_train = np.zeros((2700, 9))
        ada_train_score = np.zeros((2700, 9))
        time_rdfor = 0
        time_ada = 0
        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )
            if not test:
                X_input = self.input_[i][self.validation_indices[0]:, :]
                y_input = (self.labels[self.validation_indices[0]:]).reshape(900, )
            else:
                X_input = self.input_[i][self.test_indices[0]:self.test_indices[1], :]
                y_input = (self.labels[self.test_indices[0]:self.test_indices[1]]).reshape(900, )
            rdn_train, rdn_for, time_rdfor = self.class_rdnforest(
                time_ms=time_rdfor, X_input_score=rdn_for,
                X_train_score=rdn_train, max_depth=4, min_samples_split=2,
                random_state=2, X_train=X_train, X_input=X_input,
                y_train=y_train, y_input=y_input)
            ada_train_score, ada_score, time_ada = self.class_ada(
                time_ms=time_ada, X_train=X_train, X_input=X_input,
                X_train_score=ada_train_score, X_input_score=ada_score,
                y_input=y_input, y_train=y_train)
        time_rdfor = self.class_rdnforest(
            time_ms=time_rdfor, X_train_score=None, X_input_score=None,
            max_depth=4, min_samples_split=2, random_state=2,
            X_train=rdn_train, X_input=rdn_for, y_train=y_train,
            y_input=y_input, figname='rdfor_class.png', last=True)
        time_ada = self.class_ada(
            time_ms=time_ada, X_train_score=None, X_input_score=None,
            X_train=X_train, X_input=X_input, y_train=y_train,
            y_input=y_input, figname='ada_class.png', last=True)
        print('Training time for Random Forest: %f' % time_rdfor)
        print('Training time for AdaBoost: %f' % time_ada)

    def train_and_class_selfeat(self, kbest=True, test=False):
        rdn_for = np.zeros((900, 9))
        ada_score = np.zeros((900, 9))
        rdn_train = np.zeros((2700, 9))
        ada_train_score = np.zeros((2700, 9))
        time_rdfor = 0
        time_ada = 0
        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )
            if not test:
                X_input = self.input_[i][self.validation_indices[0]:, :]
                y_input = (self.labels[self.validation_indices[0]:]).reshape(900, )
            else:
                X_input = self.input_[i][self.test_indices[0]:self.test_indices[1], :]
                y_input = (self.labels[self.test_indices[0]:self.test_indices[1]]).reshape(900, )
            if kbest:
                print('Selecting k-best features...')
                X_train, X_input = self.featSel.kBest_fit(X_train=X_train, X_input=X_input)
            else:
                print('Selecting with extra trees...')
                X_train, X_input = self.featSel.extree_fit(X_train=X_train, X_input=X_input)
            rdn_train, rdn_for, time_rdfor = self.class_rdnforest(
                time_ms=time_rdfor, X_input_score=rdn_for,
                X_train_score=rdn_train, max_depth=4, min_samples_split=2,
                random_state=2, X_train=X_train, X_input=X_input,
                y_train=y_train, y_input=y_input)
            ada_train_score, ada_score, time_ada = self.class_ada(
                time_ms=time_ada, X_train=X_train, X_input=X_input,
                X_train_score=ada_train_score, X_input_score=ada_score,
                y_input=y_input, y_train=y_train)
        time_rdfor = self.class_rdnforest(
            time_ms=time_rdfor, X_train_score=None, X_input_score=None,
            max_depth=4, min_samples_split=2, random_state=2,
            X_train=rdn_train, X_input=rdn_for, y_train=y_train,
            y_input=y_input,
            figname='kbestfeat_rdfor.png' if kbest else 'extree_rdfor.png',
            last=True)
        time_ada = self.class_ada(
            time_ms=time_ada, X_train_score=None, X_input_score=None,
            X_train=X_train, X_input=X_input, y_train=y_train,
            y_input=y_input,
            figname='kbestfeat_ada.png' if kbest else 'extree_ada.png',
            last=True)
        print('Training time for Random Forest: %f' % time_rdfor)
        print('Training time for AdaBoost: %f' % time_ada)
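# A usage sketch for the class above, assuming nine feature matrices of 4500
# rows each: with train_percent=0.6 and test_percent=0.2 the hardcoded reshape
# sizes (2700 train / 900 test / 900 validation rows) line up.
# methods = ClassifierMethods(input_, labels, train_percent=0.6, test_percent=0.2)
# methods.rank_features()
# methods.train_and_class(test=False)          # evaluate on the validation slice
# methods.train_and_class_selfeat(kbest=True)  # same, after k-best feature selection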
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo. If not, see <http://www.gnu.org/licenses/>.

# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from FeatureSelector import FeatureSelector

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Feature selection over the Gaia dataset')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--dataset', default=None, help='Path to the gaia dataset (default="dbs/COLLECTIONNAME.db")')
    parser.add_argument('--pca-covered-variance', type=int, default=75, help='The PCA transformation should keep at least this percentage of variance (default=75)')
    parser.add_argument('--exclude-highlevel', help='exclude high level descriptors', action="store_true")
    args = parser.parse_args()
    if args.dataset is None:
        args.dataset = "dbs/" + args.collection_name + ".db"
    if not os.path.exists(args.dataset):
        print "Dataset '%s' not found" % args.dataset
        sys.exit(-1)
    print args
    feature_selector = FeatureSelector()
    feature_selector.select(args.dataset, args.pca_covered_variance, not args.exclude_highlevel)
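# Hypothetical invocations of the script above (the filename
# select_features.py is assumed; it is not given in the source):
#
#   python select_features.py my_collection --pca-covered-variance 80
#   python select_features.py my_collection --dataset dbs/custom.db --exclude-highlevel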
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from FeatureSelector import FeatureSelector
from preproccess import read_data, prepare_and_scale_data

train, test = read_data()
train, y_train, test, id_test = prepare_and_scale_data(train, test)

feature_selector = FeatureSelector(train, test)
predictors = feature_selector.feature_selection_based_on_genetic_algo(train, test, y_train)
train = train[predictors]

en = LinearRegression(fit_intercept=True, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=100, n_jobs=2, max_depth=6)
et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, max_depth=6)
xgbm = xgb.sklearn.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=1000, base_score=y_train.mean())
lgbm = lgb.LGBMRegressor(nthread=3, silent=True, learning_rate=0.05, max_depth=7, n_estimators=1000)
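# cross_val_score is imported above but unused in the visible fragment; a
# plausible continuation would score each regressor on the GA-selected
# features, for example (the scoring choice is illustrative):
for name, model in [("linear", en), ("rf", rf), ("et", et), ("xgb", xgbm), ("lgbm", lgbm)]:
    scores = cross_val_score(model, train, y_train, cv=5, scoring="neg_mean_squared_error")
    print(name, scores.mean())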
# (fragment: the lines above the closing ''' below belong to a triple-quoted,
# disabled block that was opened before this excerpt)
sample_df.columns
sample_df.to_csv(submission_file, index=False)
print 'Done'
'''
#FeatureSelector().select_features(write=True)
#a = pd.Series( DataProcessor().get_all_commands_series())
#print a
#commands = pd.Series(DataProcessor().get_all_commands_series())
#print commands.keys()

sample_df = pd.read_csv(sample_submission_file)
result_df = pd.read_csv('outputs/FeatureSelector/all_500_500.csv')
cols = select_k_best(result_df, 200)
result_df = result_df[cols]
result_df.loc[:, 'Label'] = FeatureSelector().get_labels_array_all()
result_df.to_csv('outputs/FeatureSelector/selected_all.csv')

v = pd.read_csv(validation_file)
validation_set = v['Label']
classification_res = []
clf = LOF(n_neighbors=20, contamination=0.1)

#for num in range(0, 40):
#    print "******* User {} ********".format(num)
#    ClassificationModel(user_num=num, df=result_df).optimize_parameters()

for num in range(0, 10):
    print "******* User {} ********".format(num)
    classification_res.extend(
        ClassificationModel(user_num=num, df=result_df, model=clf).predictLabels())
# Imports reconstructed for this snippet (the FeatureSelector import path
# follows the convention used elsewhere in this collection):
import json
import os
import pickle
import random
import sys
from collections import defaultdict

import nltk
import tabulate
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB

from FeatureSelector import FeatureSelector


class ClassifierCreator:
    def __init__(self):
        self.dataSets = os.listdir('data/')
        self.categories = ["geen_event", "sport", "entertainment", "bijeenkomst", "incident", "anders"]
        # self.categories = ["wel_event", "geen_event", "sport", "entertainment", "bijeenkomst", "incident", "anders"]  # uncomment for wel_event
        self.classifierAFeatures = ['wordFeatures']
        self.classifierBFeatures = ['category', 'location', 'wordOverlapSimple', 'wordOverlapUser']
        self.annotation = {}
        self.candidates = {}
        self.result = defaultdict(self.resultDictionary)
        self.cm = []
        self.informativeFeatures = []
        self.accuracy = []
        self.choice = 0

        # real test or dev test?
        self.realTest = False
        if len(sys.argv) == 2:
            if sys.argv[1] == "-test":
                self.realTest = True
        if self.realTest:
            print("\nThe system is running in TEST mode.\n")
            self.ITERATIONS = 1
        else:
            print("\nThe system is running in DEVTEST mode.\n")
            self.ITERATIONS = 10

        self.__loadDataSet()
        self.featureSelector = FeatureSelector(self.candidates)
        self._trainClassifiers()
        if self.realTest:
            self._saveClassifiers()

    def __loadDataSet(self):
        for i, dataset in enumerate(self.dataSets):
            print("{}: {}".format(i, dataset))
        if self.realTest:
            self.choice = int(input("\nPlease select an annotated TRAIN dataset: "))
        else:
            self.choice = int(input("\nPlease select an annotated TRAIN/DEVTEST dataset: "))
        with open("data/" + self.dataSets[self.choice] + "/sanitizedAnnotation.json") as jsonFile:
            self.annotation = json.load(jsonFile)
        with open("data/" + self.dataSets[self.choice] + "/sanitizedEventCandidates.json") as jsonFile:
            self.candidates = json.load(jsonFile)

        if self.realTest:
            print()
            for i, dataset in enumerate(self.dataSets):
                print("{}: {}".format(i, dataset))
            choice = int(input("\nPlease select an annotated TEST dataset: "))
            with open("data/" + self.dataSets[choice] + "/sanitizedEventCandidates.json") as jsonFile:
                self.testCandidates = json.load(jsonFile)
            # add to annotation file
            with open("data/" + self.dataSets[choice] + "/sanitizedAnnotation.json") as jsonFile:
                self.testAnnotation = json.load(jsonFile)

    def _saveClassifiers(self):
        print("\nSaving the category and event classifier...")
        with open("data/" + self.dataSets[self.choice] + "/categoryClassifier.bin", "wb") as f:
            pickle.dump(self.classifierA, f)
        with open("data/" + self.dataSets[self.choice] + "/eventClassifier.bin", "wb") as f:
            pickle.dump(self.classifierB, f)

    def _selectDataset(self):
        dataset = []
        for h in self.candidates:
            for t in self.candidates[h]:
                dataset.append((self.candidates[h][t], self.eventType(h, t)))
        if self.realTest:
            # use all of the annotated train data to train
            self.trainData = dataset
            dataset = []
            for h in self.testCandidates:
                for t in self.testCandidates[h]:
                    dataset.append((self.testCandidates[h][t], self.eventType(h, t)))
            self.testData = dataset
        else:
            random.shuffle(dataset)  # random dataset splits for cross validation
            trainSplit = int(0.8 * len(dataset))
            self.trainData = dataset[:trainSplit]
            self.testData = dataset[trainSplit:]

    def _trainClassifiers(self):
        print("\nClassifying events...\n")
        for i in range(self.ITERATIONS):
            if self.realTest:
                testMode = "TEST"
            else:
                testMode = "DEVTEST"
            print("###########")
            print("### {} {}".format(testMode, i + 1))
            print("#############")
            self._selectDataset()
            self.testA = []
            self.trainA = []
            self.testB = []
            self.trainB = []

            # first train the category classifier
            print("### TRAINING STEP 1: Training category classifier (Naive Bayes with word features) ###")
            for candidate, label in self.testData:
                featuresA = self.featureSelector.getFeatures(candidate, self.classifierAFeatures)
                self.testA.append((featuresA, label))
            for candidate, label in self.trainData:
                featuresA = self.featureSelector.getFeatures(candidate, self.classifierAFeatures)
                self.trainA.append((featuresA, label))
            # MultinomialNB appears about as good here as the nltk Naive Bayes classifier, but is somewhat faster
            self.classifierA = SklearnClassifier(MultinomialNB()).train(self.trainA)
            # sends the category classifier to the featureSelector
            self.featureSelector.addCategoryClassifier(self.classifierA)

            print("### TRAINING STEP 2: Training event/non-event classifier (Naive Bayes with category & other features) ###")
            # second step: train the event/no-event classifier (a second category classifier)
            for candidate, label in self.testData:
                featuresB = self.featureSelector.getFeatures(candidate, self.classifierBFeatures)
                self.featureKeys = featuresB.keys()
                self.testB.append((featuresB, label))
            for candidate, label in self.trainData:
                featuresB = self.featureSelector.getFeatures(candidate, self.classifierBFeatures)
                self.featureKeys = featuresB.keys()
                self.trainB.append((featuresB, label))
            self.classifierB = nltk.NaiveBayesClassifier.train(self.trainB)
            self.calculateStats(i)
        # printStats indexes results for all iterations, so it runs after the loop
        self.printStats()

    def resultDictionary(self):
        return defaultdict(list)

    def calculateStats(self, i):
        '''Function to calculate all stats'''
        # calculate the confusion matrix for this iteration
        predicted = []
        gold = []
        for f, e in self.testB:
            predicted.append(self.classifierB.classify(f))
            gold.append(e)
        # note: predictions are passed as nltk's "reference" argument, so the
        # matrix is transposed relative to the usual convention; kept as-is to
        # preserve the original output
        self.cm.append(nltk.ConfusionMatrix(predicted, gold))
        #self.informativeFeatures.append(self.classifierB.most_informative_features(10))
        print()

        # calculate precision and recall for this iteration for each category
        refsets = defaultdict(set)
        testsets = defaultdict(set)
        #allCount = 0
        #noEventCount = 0
        for n, (feats, label) in enumerate(self.testB):
            #allCount += 1
            #if label == "geen_event":
            #    noEventCount += 1
            refsets[label].add(n)
            observed = self.classifierB.classify(feats)
            # uncomment for wel_event
            #if label != "geen_event":
            #    refsets["wel_event"].add(n)
            #if observed != "geen_event":
            #    testsets["wel_event"].add(n)
            testsets[observed].add(n)
        #print("Accuracy geen_event (baseline) is", noEventCount/allCount)
        self.accuracy.append(nltk.classify.accuracy(self.classifierB, self.testB))
        # compute precision and recall for each category
        for category in self.categories:
            if category in testsets:
                self.result[category]["p"].append(nltk.metrics.precision(refsets[category], testsets[category]))
                self.result[category]["r"].append(nltk.metrics.recall(refsets[category], testsets[category]))
                self.result[category]["f"].append(nltk.metrics.f_measure(refsets[category], testsets[category]))
            else:
                self.result[category]["p"].append(float(0))
                self.result[category]["r"].append(float(0))
                self.result[category]["f"].append(float(0))

    def eventType(self, geohash, timestamp):
        # return values {use strings?}
        eventTypes = {0: "geen_event", 1: "sport", 2: "entertainment", 3: "bijeenkomst", 4: "incident", 5: "anders"}
        try:
            returnValue = eventTypes[self.annotation[geohash][timestamp]]
        except KeyError:
            returnValue = eventTypes[self.testAnnotation[geohash][timestamp]]
        return returnValue

    def printStats(self):
        print(", ".join(self.classifierBFeatures))
        it = self.ITERATIONS
        print("### EVALUATION STEP 1: Detailed statistics for the classifier:")
        for i in range(it):
            if self.realTest:
                testMode = "TEST"
            else:
                testMode = "DEVTEST"
            print("\n###########")
            print("### {} {}".format(testMode, i + 1))
            print("#############\n")
            print(self.cm[i])
            print("Most informative features")
            # print(self.informativeFeatures[i])
        print("\n### EVALUATION STEP 2: Classification using features: {} | training set size: {} & test set size: {}\n".format(
            ", ".join(self.featureKeys), len(self.trainB), len(self.testB)))
        headers = ['#', 'accuracy'] + self.categories
        prf = "P R F"
        table = [['', '', prf, prf, prf, prf, prf, prf]]
        for i in range(it):
            row = [i + 1, round(self.accuracy[i], 2)]
            for category in self.categories:
                value = "{:.2f} {:.2f} {:.2f}".format(
                    self.customRound(self.result[category]["p"][i], 2),
                    self.customRound(self.result[category]["r"][i], 2),
                    self.customRound(self.result[category]["f"][i], 2))
                row.extend([value])
            table.append(row)
        # averages
        row = ["Avg.", round(sum(self.accuracy) / len(self.accuracy), 2)]
        for category in self.categories:
            value = "{:.2f} {:.2f} {:.2f}".format(
                self.customAvg(self.result[category]["p"]),
                self.customAvg(self.result[category]["r"]),
                self.customAvg(self.result[category]["f"]))
            row.extend([value])
        table.append(row)
        print(tabulate.tabulate(table, headers=headers))
        print("\nLATEX table\n")
        print(tabulate.tabulate(table, headers=headers, tablefmt="latex"))

    def customAvg(self, l):
        try:
            returnValue = round(sum(l) / len(l), 2)
        except TypeError:
            returnValue = 0.0
        return returnValue

    def customRound(self, n, d):
        try:
            returnValue = round(n, d)
        except TypeError:
            returnValue = 0.0
        return returnValue
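# ClassifierCreator does all of its work in __init__ (loading data, training,
# and, in TEST mode, pickling the classifiers), so usage reduces to
# instantiating it. Hypothetical invocations (the script name is assumed):
#
#   python classifier_creator.py          # DEVTEST mode, 10 shuffled splits
#   python classifier_creator.py -test    # TEST mode, trains on the full train set
#
# ClassifierCreator()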
def processor(self):
    pre_processor = PreProcessor()
    feature_extractor = FeatureExtractor()
    feature_selector = FeatureSelector()
    accuracy_checker = AccuracyChecker()

    y_train, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_test_sj, x_test_iq = self.read_data()

    x_train_sj = pre_processor.impute_redundant_features(x_train_sj, self.impute_columns)
    x_train_iq = pre_processor.impute_redundant_features(x_train_iq, self.impute_columns)
    x_test_sj = pre_processor.impute_redundant_features(x_test_sj, self.impute_columns)
    x_test_iq = pre_processor.impute_redundant_features(x_test_iq, self.impute_columns)

    imputer_sj = Imputer(strategy='mean')
    x_train_sj = pre_processor.impute_missing_values(x_train_sj, self.features, imputer_sj)
    x_test_sj = pre_processor.impute_missing_values(x_test_sj, self.features, imputer_sj)
    imputer_iq = Imputer(strategy='mean')
    x_train_iq = pre_processor.impute_missing_values(x_train_iq, self.features, imputer_iq)
    x_test_iq = pre_processor.impute_missing_values(x_test_iq, self.features, imputer_iq)

    x_train_sj = feature_extractor.add_time_series_features(x_train_sj, window=100)
    x_train_iq = feature_extractor.add_time_series_features(x_train_iq, window=30)
    x_test_sj = feature_extractor.add_time_series_features(x_test_sj, window=100)
    x_test_iq = feature_extractor.add_time_series_features(x_test_iq, window=30)

    x_train_sj = feature_selector.drop_unnecessary_features(x_train_sj, self.drop_features, self.time_series_features)
    x_train_iq = feature_selector.drop_unnecessary_features(x_train_iq, self.drop_features, self.time_series_features)
    x_test_sj = feature_selector.drop_unnecessary_features(x_test_sj, self.drop_features, self.time_series_features)
    x_test_iq = feature_selector.drop_unnecessary_features(x_test_iq, self.drop_features, self.time_series_features)

    features_to_normalize = self.features + self.new_features
    x_train_sj[features_to_normalize] = x_train_sj[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_train_iq[features_to_normalize] = x_train_iq[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_test_sj[features_to_normalize] = x_test_sj[features_to_normalize].apply(pre_processor.normalize, axis=0)
    x_test_iq[features_to_normalize] = x_test_iq[features_to_normalize].apply(pre_processor.normalize, axis=0)

    x_train = pd.concat([x_train_sj, x_train_iq], axis=0)
    x_train.set_index('index', inplace=True)
    x_sj, y_sj = x_train.loc[x_train.city == 'sj', :], y_train.loc[x_train.city == 'sj', :]
    x_iq, y_iq = x_train.loc[x_train.city == 'iq', :], y_train.loc[x_train.city == 'iq', :]

    x_train_sj, x_cross_sj, y_train_sj, y_cross_sj = train_test_split(x_sj, y_sj, test_size=0.2, stratify=x_sj.weekofyear)
    x_train_iq, x_cross_iq, y_train_iq, y_cross_iq = train_test_split(x_iq, y_iq, test_size=0.2, stratify=x_iq.weekofyear)

    x_train_sj = feature_selector.select_features(x_train_sj, self.features, self.new_features)
    x_train_iq = feature_selector.select_features(x_train_iq, self.features, self.new_features)
    x_cross_sj = feature_selector.select_features(x_cross_sj, self.features, self.new_features)
    x_cross_iq = feature_selector.select_features(x_cross_iq, self.features, self.new_features)

    reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=500, random_state=67)
    reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=300, random_state=67)
    reg_sj_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)
    reg_iq_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)

    y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(reg_sj_gb, reg_iq_gb, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_cross_sj, x_cross_iq, "gb")
    y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(reg_sj_rf, reg_iq_rf, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_cross_sj, x_cross_iq, "rf")
    y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2, y_iq_pred_m1, y_iq_pred_m2, 5, 3)

    print("San Juan:")
    accuracy_checker.cross_validate_out_of_sample(y_sj_pred, y_cross_sj.total_cases)
    print("Iquitos:")
    accuracy_checker.cross_validate_out_of_sample(y_iq_pred, y_cross_iq.total_cases)

    predict_sj = x_test_sj[self.keys].copy()
    predict_iq = x_test_iq[self.keys].copy()
    x_sj = feature_selector.select_features(x_sj, self.features, self.new_features)
    x_iq = feature_selector.select_features(x_iq, self.features, self.new_features)
    x_test_sj = feature_selector.select_features(x_test_sj, self.features, self.new_features)
    x_test_iq = feature_selector.select_features(x_test_iq, self.features, self.new_features)

    reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, n_estimators=500, random_state=67)
    reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=300, random_state=67)
    reg_sj_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)
    reg_iq_rf = RandomForestRegressor(max_depth=None, n_estimators=700, random_state=67)

    y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(reg_sj_gb, reg_iq_gb, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq, "gb")
    y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(reg_sj_rf, reg_iq_rf, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq, "rf")
    y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2, y_iq_pred_m1, y_iq_pred_m2, 5, 3)

    predict_sj['total_cases'] = y_sj_pred.round().astype(int)
    predict_iq['total_cases'] = y_iq_pred.round().astype(int)
    predict_df = pd.concat([predict_sj, predict_iq], axis=0)
    predict_df.loc[predict_df.total_cases < 0, 'total_cases'] = 0
    self.write_results(predict_df)
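# ensemble_model is not shown in this fragment; given the two trailing integer
# arguments (5 and 3), a plausible reading is a weighted blend of the gradient
# boosting and random forest predictions. A sketch under that assumption (not
# the author's confirmed code):
def ensemble_model(self, sj_m1, sj_m2, iq_m1, iq_m2, w1, w2):
    y_sj = (w1 * sj_m1 + w2 * sj_m2) / (w1 + w2)
    y_iq = (w1 * iq_m1 + w2 * iq_m2) / (w1 + w2)
    return y_sj, y_iq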
def fitInternal(self, currentData, parent_id, parent_edge, parent_edge_operator, parent_data_type):
    #print("...... Start Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... \n")
    start = time.time()

    ###### Check terminate conditions
    # 1. when the dataset is empty (only the header row remains)
    if currentData.shape[0] == 1:
        print("...... Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
        print("Got empty dataset\n")
        return None

    curr_target, curr_target_count = np.unique(currentData[1:, -1].astype(np.float64), return_counts=True)

    # composition of counts per unique target value; needed to calculate training error
    targetCompostion = None
    for count in curr_target_count:
        if targetCompostion is not None:
            targetCompostion += "|" + str(count)
        else:
            targetCompostion = str(count)

    self.cur_num_node += 1
    cur_idx = self.cur_num_node

    # 2. when the target is already homogeneous
    if len(curr_target) == 1:
        self.tree.append([
            cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
            parent_edge_operator, 0.0, curr_target[0], targetCompostion,
            None, parent_data_type
        ])
        print("...... Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
        return

    # 3. when there is no target or attribute left
    if currentData.shape[1] < 2:
        max_count, selected_class = self.selectMajorityClass(curr_target, curr_target_count)
        self.tree.append([
            cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
            parent_edge_operator, 0.0, selected_class, targetCompostion,
            None, parent_data_type
        ])
        print("...... Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
        return
    ###### Check terminate conditions

    ## select a feature as the current node
    splitter = FeatureSelector(self.pool_size, self.attributes_info, currentData, curr_target)
    splitter.doSelect()
    cur_attr_name = splitter.selected_attr_name
    cur_attr_idx = splitter.selected_attr_idx
    cur_threshold = splitter.selected_splitter
    cur_gain = splitter.selected_gain
    cur_split_info = splitter.selected_split_info

    # composition of counts per unique edge of the selected node; needed to calculate training error
    thresholdCompostion = None
    if cur_threshold is not None and len(cur_threshold) == 1:
        # numeric feature: assuming a feature with exactly one unique edge is numeric
        cur_feature_type = 'numeric'
        no_header_data = currentData[1:, :]
        cur_threshold = cur_threshold[0]

        # 1. recursively fit the left child
        less_eq_data = no_header_data[np.where(no_header_data[:, cur_attr_idx].astype(np.float64) <= cur_threshold.astype(np.float64))]
        less_eq_data = np.concatenate(([currentData[0, :]], less_eq_data), axis=0)
        XXY_new2 = np.delete(less_eq_data, cur_attr_idx, 1)
        thresholdCompostion = str(XXY_new2.shape[0] - 1)
        self.fitInternal(XXY_new2, cur_idx, cur_threshold, '<=', cur_feature_type)
        #print("left child fit done. parent node: " + str(cur_attr_name) + "\n")

        # 2. recursively fit the right child
        greater_data = no_header_data[np.where(no_header_data[:, cur_attr_idx].astype(np.float64) > cur_threshold.astype(np.float64))]
        greater_data = np.concatenate(([currentData[0, :]], greater_data), axis=0)
        XXY_new2 = np.delete(greater_data, cur_attr_idx, 1)
        thresholdCompostion += "|" + str(XXY_new2.shape[0] - 1)
        self.fitInternal(XXY_new2, cur_idx, cur_threshold, '>', cur_feature_type)
        #print("right child fit done. parent node: " + str(cur_attr_name) + "\n")
    elif cur_threshold is not None:
        # discrete feature: assuming a feature with more than one unique edge is discrete
        cur_feature_type = 'nominal'
        no_header_data = currentData[1:, :]
        cur_threshold_str = None
        for i in range(len(cur_threshold)):
            cur_threshold_str = str(cur_threshold[i]) if cur_threshold_str is None else cur_threshold_str + "|" + str(cur_threshold[i])
            selected_dataset = no_header_data[np.where(no_header_data[:, cur_attr_idx] == cur_threshold[i])]
            selected_dataset = np.concatenate(([currentData[0, :]], selected_dataset), axis=0)
            selected_dataset = np.delete(selected_dataset, cur_attr_idx, 1)
            thresholdCompostion = str(selected_dataset.shape[0] - 1) if thresholdCompostion is None else thresholdCompostion + "|" + str(selected_dataset.shape[0] - 1)
            # fixed: the original omitted the operator argument here, which shifted
            # cur_feature_type into parent_edge_operator and dropped parent_data_type;
            # '==' is the natural operator for a nominal split
            self.fitInternal(selected_dataset, cur_idx, cur_threshold[i], '==', cur_feature_type)
        cur_threshold = cur_threshold_str
        print("children fit done. parent node: " + str(cur_attr_name) + "\n")
    else:
        print("cur idx: " + str(cur_idx))
        print(currentData)
        max_count, selected_class = self.selectMajorityClass(curr_target, curr_target_count)
        self.tree.append([
            cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
            parent_edge_operator, 0.0, selected_class, targetCompostion,
            None, parent_data_type
        ])
        return

    # save the node to the tree matrix. format:
    # [treeIdx, parentId, attrName, colIndex, gain, splitInfo, parentEdge,
    #  parentEdgeOperator, threshold, leaveVal, targetCompostion,
    #  thresholdCompostion, featureType]
    col_idx = None if cur_attr_name is None else self.attributes_info[cur_attr_name]['col_idx']
    self.tree.append([
        cur_idx, parent_id, cur_attr_name, col_idx, cur_gain, cur_split_info,
        parent_edge, parent_edge_operator, cur_threshold, None,
        targetCompostion, thresholdCompostion, cur_feature_type
    ])
    print("...... Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... Done in " + str(time.time() - start) + "!!!\n")