def train(self, features_file_path, save_dir_path):
    """Train a random-forest classifier on binarised labels and pickle it.

    Loads the train/test split through ``data_loading.load_numerical_data``,
    turns the label lists into a multi-label indicator matrix, fits a
    ``RandomForestClassifier``, prints ``model.score`` on the test split
    together with a few example predictions, and writes the fitted model to
    ``<save_dir_path>/model.pickle``.

    Args:
        features_file_path: Path passed to ``data_loading.load_numerical_data``.
        save_dir_path: Directory in which ``model.pickle`` is written.
    """
    train_features, train_labels, test_features, test_labels = \
        data_loading.load_numerical_data(features_file_path, normalise=False)

    # The former ``MultiLabelBinarizer.set_params(range(0, 16))`` did nothing:
    # it was an unbound call with the range as ``self`` and no keyword
    # parameters. Passing the classes to the constructor is what actually pins
    # the indicator columns to ids 0..15.
    # NOTE(review): assumes labels are integer genre ids in 0..15 - confirm.
    num_genres = 16
    mlb = MultiLabelBinarizer(classes=list(range(num_genres)))

    train_labels = np.array(train_labels)
    # Used to create a baseline for random chance
    # np.random.shuffle(train_labels)
    train_labels = mlb.fit_transform(train_labels)
    # Only transform here: re-fitting on the test labels could produce a
    # different column layout than the one the model is trained on.
    test_labels = mlb.transform(np.array(test_labels))

    # Reserve the first 4 tracks in test set for displaying predictions to dev
    predict_features = test_features[:4]
    predict_labels = test_labels[:4]
    test_features = test_features[4:]
    test_labels = test_labels[4:]

    print('Training RF...')
    model = sklearn.ensemble.RandomForestClassifier(verbose=1)
    model.fit(train_features, train_labels)

    print('Evaluating RF...')
    print('Accuracy: ' + str(model.score(test_features, test_labels)))
    print()
    print('Model Predictions:')
    print(np.array([[int(s) for s in x] for x in model.predict(predict_features)]))
    print()
    print('Correct Labels:')
    print(predict_labels)

    # Context manager so the file is flushed and closed even if dumping fails.
    with open(os.path.join(save_dir_path, "model.pickle"), 'wb') as model_file:
        pickle.dump(model, model_file)
class DataProcess(object):
    """Feature-processing wrapper around one scikit-learn transformer.

    ``process_type`` selects which transformer is built and stored in
    ``self.processmodule``:

    * ``"Binary"``          - ``Binarizer`` (values above ``threshold`` map to
      1, the rest to 0)
    * ``"MinMax"``          - ``MinMaxScaler`` scaling to the range (0, 1)
    * ``"Stand"``           - ``StandardScaler`` (zero mean, unit variance)
    * ``"Normal"``          - ``Normalizer`` with the l2 norm (l1 and max are
      the other options)
    * ``"MultiLabelBinar"`` - ``MultiLabelBinarizer`` with dense output
      (use ``sparse_output=True`` for CSR output)

    Raises:
        ValueError: if ``process_type`` is not one of the names above.
    """

    # Zero-argument factories so a transformer is only built once selected.
    _FACTORIES = {
        "Binary": lambda: Binarizer(copy=True, threshold=0.0),
        "MinMax": lambda: MinMaxScaler(feature_range=(0, 1), copy=True),
        "Stand": lambda: StandardScaler(copy=True, with_mean=True, with_std=True),
        "Normal": lambda: Normalizer(copy=True, norm="l2"),
        "MultiLabelBinar": lambda: MultiLabelBinarizer(sparse_output=False),
    }

    def __init__(self, process_type):
        self.process_type = process_type
        try:
            factory = self._FACTORIES[process_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which the old if/elif chain
            # also rejected with ValueError.
            factory = None
        if factory is None:
            raise ValueError("please select a correct process_type")
        self.processmodule = factory()

    def _require_type(self, *allowed):
        """Raise AssertionError unless ``self.process_type`` is in ``allowed``.

        Raised explicitly (instead of using ``assert``) so the check survives
        ``python -O`` while callers that catch AssertionError keep working.
        """
        if self.process_type not in allowed:
            raise AssertionError(
                "operation not supported for process_type %r" % (self.process_type,)
            )

    def fit_transform(self, data):
        """Fit the wrapped transformer on ``data`` and return the transformed data."""
        return self.processmodule.fit_transform(data)

    def fit(self, data):
        """Fit the wrapped transformer on ``data``."""
        self.processmodule.fit(data)

    def transform(self, data):
        """Return ``data`` transformed by the already fitted transformer.

        The result used to be discarded, so callers always received ``None``.
        """
        return self.processmodule.transform(data)

    def set_params(self, params):
        """Apply the ``params`` dict as keyword parameters of the transformer."""
        self.processmodule.set_params(**params)

    def get_params(self):
        """Return the transformer's parameters (``deep=True``)."""
        return self.processmodule.get_params(deep=True)

    def get_classes(self):
        """Return the distinct label values learned by the multi-label binarizer."""
        self._require_type("MultiLabelBinar")
        return self.processmodule.classes_

    def invser_transform(self, data):
        """Undo the transformation (misspelled name kept for existing callers)."""
        self._require_type("MultiLabelBinar", "MinMax", "Stand")
        return self.processmodule.inverse_transform(data)

    # Correctly spelled alias; ``invser_transform`` stays for compatibility.
    inverse_transform = invser_transform

    def get_max(self):
        """Return the per-feature maximum seen during fitting (``data_max_``).

        Only ``MinMaxScaler`` exposes this attribute; ``"Stand"`` used to pass
        the check and then fail with AttributeError.
        """
        self._require_type("MinMax")
        return self.processmodule.data_max_

    def get_min(self):
        """Return the per-feature minimum seen during fitting (``data_min_``).

        Only ``MinMaxScaler`` exposes this attribute.
        """
        self._require_type("MinMax")
        return self.processmodule.data_min_

    def partial_fit(self, data):
        """Update the scaler's statistics online with one more batch ``data``.

        The previous version forwarded no data at all, so every call raised
        TypeError inside scikit-learn; the batch is now passed through.
        """
        self._require_type("MinMax", "Stand")
        return self.processmodule.partial_fit(data)
def classify(self, features):
    """Predict one genre name per sample in ``features``.

    Loads the pickled model, predicts the label indicator rows, maps them
    back to genre ids with a ``MultiLabelBinarizer`` fitted on ids 0..15 and
    converts each id to a name via ``LabelManipulator.uncategorise_genre``.

    Args:
        features: Feature rows passed straight to ``model.predict``.

    Returns:
        A list of genre names, or ``["Unclassifiable"]`` when there are no
        predictions or any sample received no label at all.
    """
    # NOTE: unpickling executes arbitrary code - only load trusted model files.
    # NOTE(review): the path is relative to the working directory and uses
    # Windows separators, so it will not resolve on POSIX systems.
    with open("classification\\numerical\\random_forest\\model\\model.pickle", 'rb') as model_file:
        model = pickle.load(model_file)
    result = model.predict(features)

    # todo put this functionality into the common classifier template
    # (The former ``MultiLabelBinarizer.set_params(range(0, 16))`` call was a
    # no-op and has been dropped; fitting on the full id range below is what
    # defines the 16 output classes.)
    mlb = MultiLabelBinarizer()
    mlb.fit([range(0, 16)])
    label_tuples = mlb.inverse_transform(result)

    if not label_tuples or not all(label_tuples):
        return ["Unclassifiable"]

    # Only the first id of each tuple is used.
    # TODO (carried over from the original): check which value of the tuple
    # is the actual value.
    lm = LabelManipulator()
    return [lm.uncategorise_genre(labels[0]) for labels in label_tuples]  # ids -> names
return np.array(new_features), np.array(new_labels) # Test accuracy with only best populated genres -> Experimental (38k), Electronic (34k), Rock (33k) def cut_all_but_3_genres(train_features, train_labels, test_features, test_labels): train_features, train_labels = cut_genres_from_list(train_features, train_labels) test_features, test_labels = cut_genres_from_list(test_features, test_labels) return train_features, train_labels, test_features, test_labels train_features, train_labels, test_features, test_labels = data_loading.load_numerical_data(normalise=False) # train_features, train_labels, test_features, test_labels = cut_all_but_3_genres(train_features, train_labels, test_features, test_labels) MultiLabelBinarizer.set_params(range(0, 16)) mlb = MultiLabelBinarizer() train_labels = np.array(train_labels) np.random.shuffle(train_labels) train_labels = mlb.fit_transform(np.array(train_labels)) test_labels = mlb.fit_transform(np.array(test_labels)) predict_features = test_features[:4] predict_labels = test_labels[:4] test_features = test_features[4:] test_labels = test_labels[4:] print('Training RF...') model = RandomForestClassifier(verbose=1) model.fit(train_features, train_labels)