def Encoding(data, general_matrix=None):
    encoder = LabelBinarizer()
    count = 0
    # encode every string-valued column
    for i in range(data.shape[1]):
        if type(data[0, i]) == str:
            count += 1
            col = data[:, i]
            unique = np.unique(col if general_matrix is None else general_matrix[:, i])
            try:
                encoder.fit(unique)
            except Exception:
                pass
            new_col = encoder.transform(col)

            # split at i and i + 1
            before, removed, after = np.hsplit(data, [i, i + 1])
            # concatenate
            data = np.concatenate((before, new_col, after), axis=1)

            # only rebuild general_matrix when one was actually passed in
            if general_matrix is not None:
                before, removed, after = np.hsplit(general_matrix, [i, i + 1])
                general_matrix = np.concatenate(
                    (before, encoder.transform(general_matrix[:, i]), after), axis=1)

    print "count : %d" % count
    # return data
    return data
def one_hot_encoding(y_train, y_test):
    labelBinarizer = LabelBinarizer()
    labelBinarizer.fit(y_train)
    y_train_one_hot = labelBinarizer.transform(y_train)
    y_test_one_hot = labelBinarizer.transform(y_test)
    return y_train_one_hot, y_test_one_hot
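# A minimal, self-contained sketch (not part of the original code) showing what the
# one_hot_encoding helper above returns: LabelBinarizer maps each label to an
# indicator row, using the classes seen when fitting on y_train.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

y_train_demo = np.array([0, 1, 2, 1])
y_test_demo = np.array([2, 0])
lb_demo = LabelBinarizer().fit(y_train_demo)
print(lb_demo.classes_)                 # [0 1 2]
print(lb_demo.transform(y_test_demo))   # [[0 0 1], [1 0 0]]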
def train():
    tr, va, te = read_dataset('../mnist.pkl.gz')
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)

    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)

    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 1.0},
                session=sess)
            print "step: {0}, training accuracy {1}".format(i, train_acc)
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 0.5},
            session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    print("Test accuracy : ", test_accuracy)
class NN_Classifier(NNBase):

    def __init__(self, layers=[], lr=0.01, epochs=None, noisy=None, verbose=False):
        super(NN_Classifier, self).__init__(layers=layers, lr=lr, epochs=epochs,
                                            noisy=noisy, verbose=verbose)
        self.type = 'C'
        self.error_func = CrossEntropyError
        self.accuracy_score = AccuracyScore
        self.label_binarizer = LabelBinarizer()

    def predict(self, X):
        predictions = []
        for el in X:
            current_prediction = NNBase._predict(self, row(el))
            predictions.append(current_prediction)
        predictions = np.vstack(predictions)
        current_results = coalesce(predictions)
        return self.label_binarizer.inverse_transform(current_results)

    def predict_proba(self, X):
        predictions = []
        for el in X:
            current_prediction = NNBase._predict(self, row(el))
            predictions.append(current_prediction)
        predictions = np.vstack(predictions)
        return predictions

    def fit(self, X, T):
        T_impl = self.label_binarizer.fit_transform(T)
        if not self.epochs:
            self.epochs = 1
        for num in xrange(self.epochs):
            if self.verbose:
                print "Epoch: %d" % num
            for i in xrange(len(X)):
                NNBase._update(self, row(X[i]), row(T_impl[i]))

    def error(self, X, T):
        T_impl = self.label_binarizer.transform(T)
        Y = self.predict_proba(X)
        return self.error_func.func(Y, T_impl)

    def score(self, X, T):
        Y = self.predict(X)
        return self.accuracy_score.func(Y, T)

    def analytical_gradient(self, X, T):
        T_impl = self.label_binarizer.transform(T)
        return NNBase._analytical_gradient(self, X, T_impl)

    def numerical_gradient(self, X, T):
        T_impl = self.label_binarizer.transform(T)
        return NNBase._numerical_gradient(self, X, T_impl)
def partb():
    def load(file_name):
        file = np.load(file_name)
        X_train = file['X_train'].T
        y_train = file['y_train']
        X_test = file['X_test'].T
        y_test = file['y_test']
        X_cv = file['X_cv'].T
        y_cv = file['y_cv']
        return X_train, y_train, X_cv, y_cv, X_test, y_test

    train_ = [0, 0]
    test_ = [0, 0]
    overall = []
    for i in range(14):
        X_train, y_train, X_cv, y_cv, X_test, y_test = load('pofa{}.npz'.format(i))

        from sklearn.preprocessing import LabelBinarizer
        binarizer = LabelBinarizer()
        binarizer.fit(y_train)
        Y_train = binarizer.transform(y_train).T
        Y_cv = binarizer.transform(y_cv).T

        # nn.forward(X)
        # nn.backprop(X,Y,graient_check=True)
        print(X_train.shape[0], Y_train.shape[0])
        nn = NeuralNetwork([X_train.shape[0], 30, Y_train.shape[0]],
                           functions=[sigmoid, softmax],
                           derivatives=[derivative_sigmoid])
        nn.fit(X_train, Y_train, eta=0.01, momentum=0.5, minibatch=16, regularizer=0.15,
               max_iter=200, gradient_check=False, cv=(X_cv, Y_cv), graphs=False,
               lbfgs=False)
        output = nn.forward(X_train)
        y_train_output = binarizer.inverse_transform(output.T)
        y_test_output = binarizer.inverse_transform(nn.forward(X_test).T)
        print("Iteration: ", i)
        print((y_train_output == y_train).mean())
        print((y_test_output == y_test).mean())
        overall.append((y_test == y_test_output).mean())
        train_[0] += (y_train_output == y_train).sum()
        train_[1] += y_train.shape[0]
        test_[0] += (y_test_output == y_test).sum()
        test_[1] += y_test.shape[0]

    print("Average train accuracy: ", train_[0] / train_[1],
          "Average test accuracy: ", test_[0] / test_[1])
    print(train_, test_)
    overall = np.array(overall)
    print(overall.mean())
def load_dataset(self):
    X, y, X_test, y_test = dataset = snippet_reader.toNumpy()
    lb = LabelBinarizer()
    lb.fit(y)

    # keep only the first binarized column (one-vs-rest target for the first class)
    for y_bin in lb.transform(y).T:
        y = y_bin
        break
    for y_bin in lb.transform(y_test).T:
        y_test = y_bin
        break

    return X, y, X_test, y_test
def our_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # print "Y_true combined", y_true_combined
    # print "Y_pred combined", y_pred_combined

    tagset = set(lb.classes_)
    # print "tagset: ", tagset
    tagset = sorted(tagset)
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
def get_abalone19():
    """Loads abalone dataset, maps gender feature to binary features, adds
    new label to create abalone19 imbalanced binary classification dataset."""
    raw_data = pd.read_csv(ABALONE_FILE, sep=',')
    genders = list(raw_data.ix[:, 'gender'])
    cts_data = raw_data.drop(labels='gender', axis=1)

    # initialize & fit preprocessor
    lbz = LabelBinarizer()
    lbz.fit(genders)

    # encode categorical var
    encoded_genders = pd.DataFrame(lbz.transform(genders))
    encoded_genders.columns = ['gender_' + k for k in lbz.classes_]

    # recombine encoded data & return
    new_data = pd.concat(objs=[encoded_genders, cts_data], axis=1)
    new_data['label'] = raw_data['rings'].map(
        lambda k: 1 if k > 10 else 0)  # binary clf task
    new_data = new_data.drop('rings', axis=1)

    # standardize cts features
    if STANDARDIZE:
        for col in new_data.ix[:, 3:-1]:
            mean = new_data[col].mean()
            std = new_data[col].std()
            new_data[col] = new_data[col].map(lambda k: (k - mean) / float(std))

    pos_recs = new_data['label'].sum()
    print 'total pos class pct = {} %\n'.format(
        round(100 * pos_recs / float(len(new_data)), 3))

    return new_data
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               n_samples=n_samples)

    # Be sure to have at least one empty label
    y_true += ([],)
    y_pred += ([],)
    n_samples += 1

    lb = LabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name, metrics in METRICS_WITH_NORMALIZE_OPTION.items():
        # List of list of labels
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False) / n_samples,
                            measure,
                            err_msg="Failed with %s" % name)

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator,
                          y_pred_binary_indicator, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true_binary_indicator,
                                    y_pred_binary_indicator,
                                    normalize=False) / n_samples,
                            measure,
                            err_msg="Failed with %s" % name)
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings

    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    # tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def bio_classification_report(y_true, y_pred):
    """Evaluates entity extraction accuracy.

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    Taken from
    https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    """
    from sklearn.preprocessing import LabelBinarizer
    from itertools import chain
    from sklearn.metrics import classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
class CategoricalToNumerical(object):

    def __init__(self, dimensionality_reducer=None, verify=True):
        """Takes in a dimensionality reducer in order to convert
        categorical features into numerical.
        """
        if dimensionality_reducer is None:
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
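# Hedged sketch (not from the original source) of the idea behind CategoricalToNumerical:
# one-hot encode a categorical column with LabelBinarizer, then project the indicator
# matrix down to a single numeric feature. sklearn.decomposition.PCA is used here in
# place of the deprecated RandomizedPCA that the class above defaults to.
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA

colors = np.array(["red", "green", "blue", "green", "red"])
binarized = LabelBinarizer().fit_transform(colors)                 # shape (5, 3)
numeric = PCA(n_components=1).fit_transform(binarized).flatten()   # shape (5,)
print(numeric.shape)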
def report(test_y, pred_y):
    lb = LabelBinarizer()
    test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y)))
    pred_y_combined = lb.transform(list(chain.from_iterable(pred_y)))
    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(tagset)}
    print(classification_report(
        test_y_combined, pred_y_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset))
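# Small illustration (made-up example tags, not from the original corpus) of the
# binarization step used by report() above: sequences of BIO tags are flattened with
# itertools.chain and turned into an indicator matrix so classification_report can
# score each tag as its own column.
from itertools import chain
from sklearn.preprocessing import LabelBinarizer

true_seqs = [["B-PER", "O"], ["B-LOC", "I-LOC"]]
lb_demo = LabelBinarizer()
flat = list(chain.from_iterable(true_seqs))   # ['B-PER', 'O', 'B-LOC', 'I-LOC']
indicator = lb_demo.fit_transform(flat)       # shape (4, 4), one column per tag
print(lb_demo.classes_)                       # ['B-LOC' 'B-PER' 'I-LOC' 'O']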
class BusinessCategoriesFeature(BaseEstimator):
    """
    WARNING!!!
    Works only with a modified version of LabelBinarizer.

    A binarization of the reviews' business categories.
    """

    def __init__(self, data=None):
        self.data = data

    def __create_labels_list(self, review_list):
        labels = []
        for review in review_list:
            business = self.data.get_business_for_review(review)
            labels.append(business['categories'])
        return labels

    def fit(self, X, y):
        self.binarizer = LabelBinarizer()
        labels = self.__create_labels_list(X)
        self.binarizer.fit(labels)
        return self

    def transform(self, X):
        labels = self.__create_labels_list(X)
        binarized_labels = self.binarizer.transform(labels)
        return binarized_labels.astype(float)
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    labs = [class_indices[cls] for cls in tagset]

    return (precision_recall_fscore_support(y_true_combined,
                                            y_pred_combined,
                                            labels=labs,
                                            average=None,
                                            sample_weight=None),
            classification_report(
                y_true_combined,
                y_pred_combined,
                labels=[class_indices[cls] for cls in tagset],
                target_names=tagset,
            ),
            labs)
def logloss(act, pred):
    epsilon = 10 ** -15
    pred = np.maximum(np.minimum(pred, 1 - epsilon), epsilon)
    lb = LabelBinarizer()
    lb.fit(act)
    act_binary = lb.transform(act)
    logloss = - np.sum(np.multiply(act_binary, np.log(pred))) / pred.shape[0]
    return logloss
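# Toy usage (hypothetical data) of the logloss helper above: `act` holds the true class
# labels and `pred` holds one probability column per class, ordered like
# LabelBinarizer.classes_ (i.e. sorted label order).
import numpy as np

act_demo = np.array([0, 1, 2])
pred_demo = np.array([[0.8, 0.1, 0.1],
                      [0.2, 0.7, 0.1],
                      [0.1, 0.2, 0.7]])
print(logloss(act_demo, pred_demo))   # mean negative log-likelihood of the true classes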
def fit(self, Xt, yt, Xh, yh, callback=None):
    lbin = LabelBinarizer()
    lbin.fit(yt)
    Yt_multi = lbin.transform(yt)
    Yh_multi = lbin.transform(yh)
    sample_weight_train = np.ones(Xt.shape[0])
    sample_weight_test = np.ones(Xh.shape[0])

    if Yt_multi.shape[1] == 1:
        Yt_multi = np.hstack([1 - Yt_multi, Yt_multi])
        Yh_multi = np.hstack([1 - Yh_multi, Yh_multi])
        print('warning: only two classes detected')

    n_classes = Yt_multi.shape[1]
    n_features = Xt.shape[1]

    if self.alpha0 is None:
        self.alpha0 = np.zeros(n_classes * n_features)

    # if not np.all(np.unique(yt) == np.array([-1, 1])):
    #     raise ValueError
    x0 = np.zeros(n_features * n_classes)
    # assert x0.size == self.alpha0.size

    def h_func_grad(x, alpha):
        # x = x.reshape((-1, Yt_multi.shape[1]))
        return _multinomial_loss_grad(
            x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[:2]

    def h_hessian(x, alpha):
        # x = x.reshape((-1, Yt_multi.shape[1]))
        return _multinomial_grad_hess(
            x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[1]

    def g_func_grad(x, alpha):
        # x = x.reshape((-1, Yt_multi.shape[1]))
        return _multinomial_loss_grad(
            x, Xh, Yh_multi, np.zeros(alpha.size), sample_weight_test)[:2]

    def h_crossed(x, alpha):
        # return x.reshape((n_classes, -1)) * alpha
        # x = x.reshape((-1, Yt_multi.shape[1]))
        tmp = np.exp(alpha) * x
        return sparse.dia_matrix(
            (tmp, 0),
            shape=(n_features * n_classes, n_features * n_classes))

    opt = hoag_lbfgs(
        h_func_grad, h_hessian, h_crossed, g_func_grad, x0,
        callback=callback,
        tolerance_decrease=self.tolerance_decrease,
        lambda0=self.alpha0,
        maxiter=self.max_iter,
        verbose=self.verbose)

    self.coef_ = opt[0]
    self.alpha_ = opt[1]
    return self
def load_dataset2(self):
    X, y, X_test, y_test = dataset = snippet_reader.toNumpy()
    X, y = shuffle(X, y)
    lb = LabelBinarizer()
    lb.fit(y)

    for y_bin in lb.transform(y).T:
        return X, y_bin
def X_train_generatetor_infinite(dim=128, maxlen=500, batch_size=128, name="X_train.csv", events=None):
    X_train = pd.read_csv(path + name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    # fit both encoders on the full set of group labels; the array itself is not kept
    labels = group_le.fit_transform(X_train['group'].values)
    labels = group_lb.fit_transform(labels)
    del labels

    ##################
    #  Phone Brand
    ##################
    # print("# Read Phone Brand")
    phone_brand_device_model = pd.read_csv(path + 'phone_brand_device_model.csv',
                                           dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(
        phone_brand_device_model['phone_brand'])
    device_model_le = LabelEncoder()
    # use the dedicated encoder for device_model (the original refitted phone_brand_le here)
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(
        phone_brand_device_model['device_model'])

    while 1:
        data = pd.read_csv(path + name, iterator=True, chunksize=batch_size,
                           dtype={'device_id': np.str})
        for X_train in data:
            X_train = pd.merge(X_train, phone_brand_device_model, how='left',
                               on='device_id', left_index=True)
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values
            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values
            X_train['gender'][X_train['gender'] == 'M'] = 1
            X_train['gender'][X_train['gender'] == 'F'] = 0
            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # take log transformation
            y_train_age = np.log(y_train_age)
            X_train.fillna('0 ', inplace=True)

            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)

            x_train = X_train["app_lab"].values
            x_train = [x.split(' ') for x in x_train]
            for i in range(len(x_train)):
                x_train[i] = [np.int8(idx) for idx in x_train[i]
                              if (idx != 'nan' and idx != '')]
            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

            x_train = [x_train, phone_brand, device_model]
            y_train = [y_train, y_train_gender, y_train_age]
            yield (x_train, y_train)
def encode(self, data, label, value_set=None):
    le = LabelBinarizer()
    if value_set is None:
        encoded = le.fit_transform(data[label])
    else:
        le.fit(value_set)
        encoded = le.transform(data[label])
    for i in range(encoded.shape[1]):
        new_label = '{0}_is_{1}'.format(label, i)
        data[new_label] = encoded[:, i]
class _CategoricalEncoder:
    """OneHotEncoder that can handle categorical variables."""

    def __init__(self):
        """Convert labeled categories into one-hot encoded features."""
        self._lb = LabelBinarizer()

    def fit(self, X):
        """Fit a list or array of categories.

        Parameters
        ----------
        * `X` [array-like, shape=(n_categories,)]:
            List of categories.
        """
        self.mapping_ = {v: i for i, v in enumerate(X)}
        self.inverse_mapping_ = {i: v for v, i in self.mapping_.items()}
        self._lb.fit([self.mapping_[v] for v in X])
        self.n_classes = len(self._lb.classes_)
        return self

    def transform(self, X):
        """Transform an array of categories to a one-hot encoded representation.

        Parameters
        ----------
        * `X` [array-like, shape=(n_samples,)]:
            List of categories.

        Returns
        -------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            The one-hot encoded categories.
        """
        return self._lb.transform([self.mapping_[v] for v in X])

    def inverse_transform(self, Xt):
        """Inverse transform one-hot encoded categories back to their original
        representation.

        Parameters
        ----------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            One-hot encoded categories.

        Returns
        -------
        * `X` [array-like, shape=(n_samples,)]:
            The original categories.
        """
        Xt = np.asarray(Xt)
        return [
            self.inverse_mapping_[i] for i in self._lb.inverse_transform(Xt)
        ]
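# Possible usage of the _CategoricalEncoder above (illustrative only): categories are
# first mapped to integer ids, then one-hot encoded, so inverse_transform recovers the
# original category values.
enc = _CategoricalEncoder()
enc.fit(["red", "green", "blue"])
one_hot = enc.transform(["green", "blue"])   # shape (2, 3)
print(enc.inverse_transform(one_hot))        # ['green', 'blue']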
def one_hot_encode(x):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample Labels
    : return: Numpy array of one-hot encoded labels
    """
    # TODO: Implement Function
    labels = list(range(10))
    lb = LabelBinarizer()
    lb.fit(labels)
    return np.array(lb.transform(x))
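# Quick check (assumed digit labels 0-9) of the one_hot_encode helper above.
print(one_hot_encode([0, 9, 3]))
# [[1 0 0 0 0 0 0 0 0 0]
#  [0 0 0 0 0 0 0 0 0 1]
#  [0 0 0 1 0 0 0 0 0 0]]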
def ndcg_score(ground_truth, predictions, k=5):
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)
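# Sketch (hypothetical values) of the binarization step inside ndcg_score above: the
# binarizer is fit on all possible label ids, so each ground-truth entry becomes a
# one-hot relevance vector that dcg_score can compare against the predicted scores.
from sklearn.preprocessing import LabelBinarizer

lb_demo = LabelBinarizer().fit(range(6))
print(lb_demo.transform([2]))   # [[0 0 1 0 0 0]]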
class PipelineLabelBinarizer(TransformerMixin):

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        return self.encoder.transform(x)
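# Minimal illustration (not from the original project) of the wrapper above:
# LabelBinarizer's fit and fit_transform only accept the labels, so pipelines that pass
# both X and y to every step can break; the thin TransformerMixin wrapper restores the
# usual (x, y=None) transformer signature while delegating to LabelBinarizer.
plb = PipelineLabelBinarizer()
print(plb.fit(["cat", "dog", "bird"]).transform(["dog", "cat"]))
# [[0 0 1]
#  [0 1 0]]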
def test_averaging_multiclass(n_samples=50, n_classes=3):
    random_state = check_random_state(0)
    y_true = random_state.randint(0, n_classes, size=(n_samples,))
    y_pred = random_state.randint(0, n_classes, size=(n_samples,))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize,
               y_pred, y_pred_binarize, y_score)
def small_word_conv(dataset_path):
    docs, y, test_docs, test_y = nli2013_train_test_split(dataset_path)

    logging.info('preprocessing, padding and binarizing data ...')
    docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != ''])
            for doc in docs]
    test_docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != ''])
                 for doc in test_docs]

    vocab = Dictionary(docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                for s in docs],
                               max_length=100, padding_word=0))
    y = bin.fit_transform(y)
    test_x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    test_y = bin.transform(test_y)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(11, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    model.fit(x, y, batch_size=32, nb_epoch=10, validation_data=[test_x, test_y])

    print(accuracy_score(np.argwhere(test_y)[:, 1], model.predict_classes(test_x)))
def evaluate(self, y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {"O"}
    tagset = sorted(tagset, key=lambda tag: tag.split("-", 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
def _create_covertype(directory):
    urlbase = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/"
    destdir = os.path.join(_DATA_DIRECTORY, "raw")
    fn = _download_file(urlbase, "covtype.data.gz", destdir)
    with gzip.open(fn, "rb") as gzfile:
        X = pd.read_csv(gzfile, header=None).values

    X, y = X[:, :-1].astype(np.float64), X[:, -1]
    y -= 1  # make classes 0-based

    # split into test- and validationset
    idx = range(X.shape[0])
    from sklearn.cross_validation import train_test_split
    X, Xtest, y, ytest = train_test_split(X, y, test_size=0.1)
    X, Xval, y, yval = train_test_split(X, y, test_size=0.25)

    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    yval = lb.transform(yval)
    ytest = lb.transform(ytest)

    # Most values are binary, except for these, so let's standardize them
    quant_idx = [0, 1, 2, 3, 4, 5, 9]  # real numbers
    int_idx = [6, 7, 8]                # integers from [0, 255)
    from sklearn.preprocessing import StandardScaler as Scaler
    scaler = Scaler()
    X[:, quant_idx + int_idx] = scaler.fit_transform(X[:, quant_idx + int_idx])
    Xval[:, quant_idx + int_idx] = scaler.transform(Xval[:, quant_idx + int_idx])
    Xtest[:, quant_idx + int_idx] = scaler.transform(Xtest[:, quant_idx + int_idx])

    data = [["train", X, y], ["valid", Xval, yval], ["test", Xtest, ytest]]
    m = np.zeros(X.shape[1])
    m[quant_idx + int_idx] = scaler.mean_
    s = np.ones(X.shape[1])
    s[quant_idx + int_idx] = scaler.std_
    other = {"center": m, "scale": s}
    _store(data, os.path.join(_DATA_DIRECTORY, "covertype.hdf5"), other)
def test_averaging_multiclass(name):
    n_samples, n_classes = 50, 3
    random_state = check_random_state(0)
    y_true = random_state.randint(0, n_classes, size=(n_samples, ))
    y_pred = random_state.randint(0, n_classes, size=(n_samples, ))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    check_averaging(name, y_true, y_true_binarize,
                    y_pred, y_pred_binarize, y_score)
def get_classification_report(validation_y, validation_pred):
    """
    Returns the classification report for the given classifier. It uses the
    predicted labels of the validation set as the basis for testing the
    performance of the classifier.
    """
    lb = LabelBinarizer()
    val_y = lb.fit_transform(list(validation_y))
    val_pred = lb.transform(list(validation_pred))
    tagset = get_classnames()
    return classification_report(val_y, val_pred, target_names=tagset)
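# Hedged example (made-up labels, not from the original project) of the binarization used
# in get_classification_report: fitting on the true labels and reusing the same binarizer
# for the predictions keeps the column order identical, which is what
# classification_report expects.
from sklearn.preprocessing import LabelBinarizer

lb_demo = LabelBinarizer()
val_y_demo = lb_demo.fit_transform(["spam", "ham", "other"])
val_pred_demo = lb_demo.transform(["ham", "ham", "other"])
print(lb_demo.classes_)   # ['ham' 'other' 'spam']
print(val_pred_demo)      # rows aligned to the same class columns as val_y_demo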
K.clear_session()
model = Sequential()
rbflayer = RBFLayer(g_param,
                    initializer=InitCentersRandom(
                        oDataSet.attributes[oData.Training_indexes[train]]),
                    betas=g2_param,
                    input_shape=(base.shape[1],))
model.add(rbflayer)
model.add(Dense(len(oDataSet.labelsNames), activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer=_OPTIMIZER)

model.fit(oDataSet.attributes[oData.Training_indexes[train]],
          lb.transform(oDataSet.labels[oData.Training_indexes[train]]),
          batch_size=50,
          epochs=epochs,
          verbose=0)

y_pred = model.predict(
    oDataSet.attributes[oData.Training_indexes[test]]).argmax(axis=1)
y_true = oDataSet.labels[oData.Training_indexes[test]]

grid_result[g1, g2, k_slice] = accuracy_score(y_true, y_pred)
print(grid_result)
k_slice += 1

best_p = GRID_NEURON[np.unravel_index(np.argmax(np.mean(grid_result, axis=2)),
                                      grid_result.shape[:2])[0]]
best_b = GRID_B[np.unravel_index(np.argmax(np.mean(grid_result, axis=2)),
                                 grid_result.shape[:2])[1]]
# converting data and labels to np array
data = np.array(data, dtype="float")
labels = np.array(labels)

# scaling the values of data between 0 and 1
data = data / 255.0

# Split the training data into separate train and test sets
(train_x, val_x, train_y, val_y) = train_test_split(data, labels,
                                                    test_size=0.3,
                                                    random_state=13)

# one hot encoding
lb = LabelBinarizer().fit(train_y)
train_y = lb.transform(train_y)
val_y = lb.transform(val_y)

# building model
model = Sequential()
model.add(Conv2D(40, (5, 5), padding="same", input_shape=(40, 40, 1), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(100, (5, 5), padding="same", activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
def nnCostFunction(nn_params, *args):
    """Compute the NN cost function and its partial derivatives."""
    in_size, hid_size, num_labels, X, y, lam = args

    # Restore the flattened network parameters into weight matrices
    Theta1 = nn_params[0:(in_size + 1) * hid_size].reshape((hid_size, in_size + 1))
    Theta2 = nn_params[(in_size + 1) * hid_size:].reshape((num_labels, hid_size + 1))

    # Gradients of the parameters
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    # Number of training samples
    m = X.shape[0]

    # Prepend a column of ones (bias term) to the training data
    X = np.hstack((np.ones((m, 1)), X))

    # Convert the labels to a 1-of-K representation
    lb = LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)

    J = 0
    for i in range(m):
        xi = X[i, :]
        yi = y[i]

        # forward propagation
        a1 = xi
        z2 = np.dot(Theta1, a1)
        a2 = sigmoid(z2)
        a2 = np.hstack((1, a2))
        z3 = np.dot(Theta2, a2)
        a3 = sigmoid(z3)
        J += sum(-yi * safe_log(a3) - (1 - yi) * safe_log(1 - a3))

        # backpropagation
        delta3 = a3 - yi
        delta2 = np.dot(Theta2.T, delta3) * sigmoidGradient(np.hstack((1, z2)))
        delta2 = delta2[1:]  # drop the element corresponding to the bias term

        # We need vector-times-vector outer products (matrices), so reshape
        # everything into column vectors; -1 lets numpy infer the number of rows
        delta2 = delta2.reshape((-1, 1))
        delta3 = delta3.reshape((-1, 1))
        a1 = a1.reshape((-1, 1))
        a2 = a2.reshape((-1, 1))

        # Accumulate the gradients
        Theta1_grad += np.dot(delta2, a1.T)
        Theta2_grad += np.dot(delta3, a2.T)

    J /= m

    # Regularization term (bias weights are not penalized)
    temp = 0.0
    for j in range(hid_size):
        for k in range(1, in_size + 1):
            temp += Theta1[j, k] ** 2
    for j in range(num_labels):
        for k in range(1, hid_size + 1):
            temp += Theta2[j, k] ** 2
    J += lam / (2.0 * m) * temp

    # Regularization term of the gradients (bias columns excluded)
    Theta1_grad /= m
    Theta1_grad[:, 1:] += (lam / m) * Theta1[:, 1:]
    Theta2_grad /= m
    Theta2_grad[:, 1:] += (lam / m) * Theta2[:, 1:]

    # Flatten back into a vector
    grad = np.hstack((np.ravel(Theta1_grad), np.ravel(Theta2_grad)))

    print "J =", J

    return J, grad
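# Tiny demonstration (illustrative labels) of the 1-of-K conversion used in
# nnCostFunction above: LabelBinarizer turns each class id into an indicator row, which
# is what the per-sample cross-entropy loop expects for yi.
from sklearn.preprocessing import LabelBinarizer
import numpy as np

lb_demo = LabelBinarizer().fit(np.arange(10))
print(lb_demo.transform([0, 5]))
# [[1 0 0 0 0 0 0 0 0 0]
#  [0 0 0 0 0 1 0 0 0 0]]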
4.0""" return np.round(number * 2) / 2 min_val = -40 max_val = 40 y_train = round_of_rating(saturate(y_train, min_val, max_val)) r_int = 0.5 slist = np.arange(min_val, max_val + r_int, r_int) * 2 #multiply by 2 to allow labelbinarizer to work lb = LabelBinarizer() lb.fit(slist) ylabels = lb.transform(y_train * 2) # In[17]: print(x_train.shape) print(xfcss_train.shape) print(ylabels.shape) # In[18]: nsamps = x_train.shape[0] n80p = int(np.floor(nsamps * 0.8)) rannums = np.array(random.sample(range(1, nsamps, 1), n80p)) s_nfiles = np.arange(nsamps) test_set = np.setdiff1d(s_nfiles, rannums)
def task_1_tuttocompleto(df):
    print("======================== task_1_tuttocompleto =============================")
    del df['entity_charOffset']
    del df['entity_id']
    print(df.shape)

    df2 = df.copy(deep=True)
    data = construct_dataset_tutto(df, df2)
    print('important', df.shape)  # df does not have entity_type removed
    headers2 = ['token_name', 'token_tag', 'sentence_id', 'sentence_text', 'entity_name']
    df2 = pd.DataFrame(data, columns=headers2)

    df_train, df_test = train_test_split(df2, test_size=0.2, random_state=22, shuffle=False)
    text_train = df_train['sentence_text'].as_matrix()
    text_test = df_test['sentence_text'].as_matrix()
    print('text_train.shape', text_train.shape)

    sw = stopwords.words("english")
    vectorizer = TfidfVectorizer(lowercase=True, binary=True, stop_words=sw,
                                 sublinear_tf=True, norm=None)
    x_train = vectorizer.fit_transform(text_train).toarray()
    x_test = vectorizer.transform(text_test).toarray()
    token_name_train = vectorizer.transform(df_train['token_name'].as_matrix()).toarray()
    token_name_test = vectorizer.transform(df_test['token_name'].as_matrix()).toarray()

    # this is an attempt to concatenate token tags to the dataset, memory leak problems
    # token_name_train = np.column_stack((token_name_train, df_train['token_tag']))
    # token_name_test = np.column_stack((token_name_test, df_test['token_tag']))
    # [:,None]
    x_train = np.concatenate((x_train, token_name_train), axis=1)
    x_test = np.concatenate((x_test, token_name_test), axis=1)

    del sw
    del vectorizer
    del token_name_train
    del token_name_test
    del df2
    del data
    # return x_train, x_test, y_train, y_test, df_train['token_tag'], df_test['token_tag']

    y_train = df_train['entity_name'].astype("category").cat.codes.as_matrix()
    y_test = df_test['entity_name'].astype("category").cat.codes.as_matrix()

    lb = LabelBinarizer()
    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)

    pred = simple_nn(x_train, x_test, y_train, 5)
    # pred = lb.inverse_transform(pred)
    y_train = lb.inverse_transform(y_train)
    y_test = lb.inverse_transform(y_test)
    pred_list = [pred]
    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    lgr = LogisticRegression(C=0.05, class_weight='balanced')
    lgr.fit(x_train, y_train)
    pred1 = lgr.predict(x_test)
    pred_list.append(pred1)
    print(accuracy_score(pred1, y_test))
    print(f1_score(pred1, y_test, average='macro'))

    svc = LinearSVC(C=0.0004, class_weight='balanced')
    svc.fit(x_train, y_train)
    pred2 = svc.predict(x_test)
    pred_list.append(pred2)
    print(accuracy_score(pred2, y_test))
    print(f1_score(pred2, y_test, average='macro'))

    # gb = ensemble.GradientBoostingClassifier()
    # gb.fit(x_train, y_train)
    # pred = gb.predict(x_test)
    # pred_list.append(pred)

    # majority vote over the individual predictions
    final_pred = []
    for i in range(len(pred_list[0])):
        temp = [0, 0, 0, 0, 0]
        for j in range(len(pred_list)):
            temp[pred_list[j][i]] += 1
        final_pred.append(np.argmax(temp))
    pred = final_pred
    print(pred)
    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    np.save('my_pred', pred)
    np.save('y_test', y_test)
    joblib.dump(lb, 'lb')
def class_report(y_true, y_pred, y_score, alpha, average='micro'): if y_true.shape != y_pred.shape: print("Error! y_true %s is not the same shape as y_pred %s" % (y_true.shape, y_pred.shape)) return lb = LabelBinarizer() if len(y_true.shape) == 1: lb.fit(y_true) #Value counts of predictions labels, cnt = np.unique(y_pred, return_counts=True) n_classes = len(labels) pred_cnt = pd.Series(cnt, index=labels) acc = accuracy_score(y_true=y_true, y_pred=y_pred) metrics_summary = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=labels) avg = list( precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, average='weighted')) metrics_sum_index = ['precision', 'recall', 'f1-score', 'support'] class_report_df = pd.DataFrame(list(metrics_summary), index=metrics_sum_index, columns=labels) support = class_report_df.loc['support'] total = support.sum() class_report_df['avg / total'] = avg[:-1] + [total] class_report_df = class_report_df.T class_report_df['pred'] = pred_cnt class_report_df['pred'].iloc[-1] = total """ matrix = confusion_matrix(y_true, y_pred) accs = matrix.diagonal() / matrix.sum(axis=1) print("accuracies") print(accs) """ if not (y_score is None): fpr = dict() tpr = dict() roc_auc = dict() auc_delong = dict() auc_ci = dict() auc_cov = dict() accs = dict() for label_it, label in enumerate(labels): fpr[label], tpr[label], _ = roc_curve( (y_true == label).astype(int), y_score[:, label_it]) y_true_imed = (y_true == label).astype(int) y_pred_imed = (y_pred == label).astype(int) y_score_imed = y_score[:, label_it] auc_dl, auc_co, ci = calculate_auc_ci(y_pred=y_pred_imed, y_true=y_true_imed, y_score=y_score_imed, alpha=alpha, print_results=False) auc_delong[label] = auc_dl auc_cov[label] = auc_co auc_ci[label] = ci accs[label] = accuracy_score(y_true=y_true_imed, y_pred=y_pred_imed) roc_auc[label] = auc(fpr[label], tpr[label]) if average == 'micro': if n_classes <= 2: fpr["avg / total"], tpr["avg / total"], _ = roc_curve( lb.transform(y_true).ravel(), y_score[:, 1].ravel()) else: fpr["avg / total"], tpr["avg / total"], _ = roc_curve( lb.transform(y_true).ravel(), y_score.ravel()) roc_auc["avg / total"] = auc(fpr["avg / total"], tpr["avg / total"]) elif average == 'macro': # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in labels])) # Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in labels: mean_tpr += np.interp(all_fpr, fpr[i], tpr[i]) # Finally average it and compute AUC mean_tpr /= n_classes fpr["macro"] = all_fpr tpr["macro"] = mean_tpr roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"]) accs["avg / total"] = np.mean(list(accs.values())) auc_delong["avg / total"] = "" #np.mean(list(auc_delong.values())) auc_cov["avg / total"] = "" #np.mean(list(auc_cov.values())) auc_ci["avg / total"] = "" class_report_df['accuracy'] = pd.Series(accs) class_report_df['AUC'] = pd.Series(roc_auc) class_report_df['AUC DeLong'] = pd.Series(auc_delong) class_report_df['AUC COV'] = pd.Series(auc_cov) class_report_df['AUC CI (' + str(alpha * 100) + ' %)'] = pd.Series(auc_ci) return class_report_df
    pad = ['TOKEN'] * SERIES_LENGTH
    for index in range(len(inputs) - SERIES_LENGTH - 1):
        yield [inputs[index], pad + inputs[max(0, index - SERIES_LENGTH):index]]
        pad = pad[1:]


training = [fizzbuzz(num) for num in range(1, 1000)]
training_inputs = list(create_input_features(training))

lb = LabelBinarizer()
lb.fit(training + ['TOKEN'])

X = np.array([lb.transform(features).flatten() for label, features in training_inputs])
y = np.array([label for label, features in training_inputs])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=6251)

# clf = LogisticRegression(tol=1e-6)
regr = RandomForestClassifier(max_depth=20, random_state=6251)
regr.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

predicted = regr.predict(X_test)
accuracy_score(y_test, predicted)
def main(): """Train ensemble model. """ # construct the argument parse and parse the arguments args = argparse.ArgumentParser() args.add_argument("-o", "--output", required=True, help="path to output directory") args.add_argument("-m", "--models", required=True, help="path to output models directory") args.add_argument("-n", "--num-models", type=int, default=5, help="# of models to train") args = vars(args.parse_args()) # load the training and testing data, then scale it into the range [0, 1] ((train_x, train_y), (test_x, test_y)) = cifar10.load_data() train_x = train_x.astype("float") / 255.0 test_x = test_x.astype("float") / 255.0 # convert the labels from integers to vectors label_binarizer = LabelBinarizer() train_y = label_binarizer.fit_transform(train_y) test_y = label_binarizer.transform(test_y) # initialize the label names for the CIFAR-10 dataset label_names = [ "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck" ] # construct the image generator for data augmentation augmentation = ImageDataGenerator(rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, fill_mode="nearest") # loop over the number of models to train for i in np.arange(0, args["num_models"]): # initialize the optimizer and model print("[INFO] training model {}/{}".format(i + 1, args["num_models"])) opt = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True) model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10) model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"]) # train the network model_fit = model.fit_generator(augmentation.flow(train_x, train_y, batch_size=64), validation_data=(test_x, test_y), epochs=40, steps_per_epoch=len(train_x) // 64, verbose=1) # save the model to disk path = [args["models"], "model_{}.model".format(i)] model.save(os.path.sep.join(path)) # evaluate the network predictions = model.predict(test_x, batch_size=64) report = classification_report(test_y.argmax(axis=1), predictions.argmax(axis=1), target_names=label_names) # save the classification report to file path = [args["output"], "model_{}.txt".format(i)] f = open(os.path.sep.join(path), "w") f.write(report) f.close() # plot the training loss and accuracy path = [args["output"], "model_{}.png".format(i)] plt.style.use("ggplot") plt.figure() plt.plot(np.arange(0, 40), model_fit.history["loss"], label="train_loss") plt.plot(np.arange(0, 40), model_fit.history["val_loss"], label="val_loss") plt.plot(np.arange(0, 40), model_fit.history["acc"], label="train_acc") plt.plot(np.arange(0, 40), model_fit.history["val_acc"], label="val_acc") plt.title("Training Loss and Accuracy for model {}".format(i)) plt.xlabel("Epoch #") plt.ylabel("Loss/Accuracy") plt.legend() plt.savefig(os.path.sep.join(path)) plt.close()
])

if not is_features_normal:
    train_features = normalize_grayscale(train_features)
    test_features = normalize_grayscale(test_features)
    is_features_normal = True

print('Tests Passed!')

# In[10]:

if not is_labels_encod:
    # Turn labels into numbers and apply One-Hot Encoding
    encoder = LabelBinarizer()
    encoder.fit(train_labels)
    train_labels = encoder.transform(train_labels)
    test_labels = encoder.transform(test_labels)

    # Change to float32, so it can be multiplied against the features in TensorFlow, which are float32
    train_labels = train_labels.astype(np.float32)
    test_labels = test_labels.astype(np.float32)
    is_labels_encod = True

print('Labels One-Hot Encoded')

# In[11]:

assert is_features_normal, 'You skipped the step to normalize the features'
assert is_labels_encod, 'You skipped the step to One-Hot Encode the labels'

# Get randomized datasets for training and validation
class ShapeletModel(BaseEstimator, ClassifierMixin): """Learning Time-Series Shapelets model. Learning Time-Series Shapelets was originally presented in [1]_. Parameters ---------- n_shapelets_per_size: dict Dictionary giving, for each shapelet size (key), the number of such shapelets to be trained (value) max_iter: int (default: 1000) Number of training epochs. batch_size: int (default:256) Batch size to be used. verbose_level: {0, 1, 2} (default: 2) `keras` verbose level. optimizer: str or keras.optimizers.Optimizer (default: "sgd") `keras` optimizer to use for training. weight_regularizer: float or None (default: None) `keras` regularizer to use for training the classification (softmax) layer. If None, no regularization is performed. Attributes ---------- shapelets_: numpy.ndarray of objects, each object being a time series Set of time-series shapelets. shapelets_as_time_series_: numpy.ndarray of shape (n_shapelets, sz_shp, d) where \ sz_shp is the maximum of all shapelet sizes Set of time-series shapelets formatted as a ``tslearn`` time series dataset. Note ---- This implementation requires a dataset of equal-sized time series. Examples -------- >>> from tslearn.generators import random_walk_blobs >>> X, y = random_walk_blobs(n_ts_per_blob=20, sz=64, d=2, n_blobs=2) >>> clf = ShapeletModel(n_shapelets_per_size={10: 5}, max_iter=1, verbose_level=0) >>> clf.fit(X, y).shapelets_.shape (5,) >>> clf.shapelets_[0].shape (10, 2) >>> clf.predict(X).shape (40,) >>> clf.transform(X).shape (40, 5) >>> params = clf.get_params(deep=True) >>> sorted(params.keys()) ['batch_size', 'max_iter', 'n_shapelets_per_size', 'optimizer', 'verbose_level', 'weight_regularizer'] >>> clf.set_params(batch_size=128) # doctest: +NORMALIZE_WHITESPACE ShapeletModel(batch_size=128, max_iter=1, n_shapelets_per_size={10: 5}, optimizer='sgd', verbose_level=0, weight_regularizer=0.0) >>> clf2 = ShapeletModel(n_shapelets_per_size={10: 5, 20: 10}, max_iter=1, verbose_level=0) >>> clf2.fit(X, y).shapelets_.shape (15,) >>> clf2.shapelets_[0].shape (10, 2) >>> clf2.shapelets_[5].shape (20, 2) >>> clf2.shapelets_as_time_series_.shape (15, 20, 2) >>> clf2.predict(X).shape (40,) >>> clf2.transform(X).shape (40, 15) >>> clf2.locate(X).shape (40, 15) >>> import sklearn >>> cv_results = sklearn.model_selection.cross_validate(clf, X, y, return_train_score=False) >>> cv_results['test_score'].shape (3,) References ---------- .. [1] J. Grabocka et al. Learning Time-Series Shapelets. SIGKDD 2014. 
""" def __init__(self, n_shapelets_per_size, max_iter=1000, batch_size=256, verbose_level=2, optimizer="sgd", weight_regularizer=0.): self.n_shapelets_per_size = n_shapelets_per_size self.n_classes = None self.optimizer = optimizer self.max_iter = max_iter self.weight_regularizer = weight_regularizer self.model = None self.transformer_model = None self.locator_model = None self.batch_size = batch_size self.verbose_level = verbose_level self.categorical_y = False self.label_binarizer = None self.binary_problem = False self.d = None @property def _n_shapelet_sizes(self): return len(self.n_shapelets_per_size) @property def shapelets_(self): total_n_shp = sum(self.n_shapelets_per_size.values()) shapelets = numpy.empty((total_n_shp, ), dtype=object) idx = 0 for i, shp_sz in enumerate(sorted(self.n_shapelets_per_size.keys())): n_shp = self.n_shapelets_per_size[shp_sz] for idx_shp in range(idx, idx + n_shp): shapelets[idx_shp] = numpy.zeros((shp_sz, self.d)) for di in range(self.d): for inc, shp in enumerate( self.model.get_layer("shapelets_%d_%d" % (i, di)).get_weights()[0]): shapelets[idx + inc][:, di] = shp idx += n_shp assert idx == total_n_shp return shapelets @property def shapelets_as_time_series_(self): total_n_shp = sum(self.n_shapelets_per_size.values()) shp_sz = max(self.n_shapelets_per_size.keys()) non_formatted_shapelets = self.shapelets_ d = non_formatted_shapelets[0].shape[1] shapelets = numpy.zeros((total_n_shp, shp_sz, d)) + numpy.nan for i in range(self._n_shapelet_sizes): sz = non_formatted_shapelets[i].shape[0] shapelets[i, :sz, :] = non_formatted_shapelets[i] return shapelets def fit(self, X, y): """Learn time-series shapelets. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. y : array-like of shape=(n_ts, ) Time series labels. """ n_ts, sz, d = X.shape self.d = d if y.ndim == 1: self.label_binarizer = LabelBinarizer().fit(y) y_ = self.label_binarizer.transform(y) # if y_.shape[1] == 1: # y_ = numpy.hstack((y_, 1 - y_)) else: y_ = y self.categorical_y = True assert y_.shape[ 1] != 2, "Binary classification case, monodimensional y should be passed." if y_.ndim == 1: n_classes = 2 else: n_classes = y_.shape[1] self._set_model_layers(X=X, ts_sz=sz, d=d, n_classes=n_classes) self.model.compile( loss="categorical_crossentropy" if n_classes > 2 else "binary_crossentropy", optimizer=self.optimizer, metrics=[categorical_accuracy, categorical_crossentropy] if n_classes > 2 else [binary_accuracy, binary_crossentropy]) self.transformer_model.compile(loss="mean_squared_error", optimizer=self.optimizer) self.locator_model.compile(loss="mean_squared_error", optimizer=self.optimizer) self._set_weights_false_conv(d=d) self.model.fit([X[:, :, di].reshape((n_ts, sz, 1)) for di in range(d)], y_, batch_size=self.batch_size, epochs=self.max_iter, verbose=self.verbose_level) return self def predict(self, X): """Predict class probability for a given set of time series. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. Returns ------- array of shape=(n_ts, ) or (n_ts, n_classes), depending on the shape of the \ label vector provided at training time. Index of the cluster each sample belongs to or class probability matrix, depending on what was provided at training time. 
""" X_ = to_time_series_dataset(X) n_ts, sz, d = X_.shape categorical_preds = self.model.predict( [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)], batch_size=self.batch_size, verbose=self.verbose_level) if self.categorical_y: return categorical_preds else: if categorical_preds.shape[1] == 2: categorical_preds = categorical_preds[:, 0] return self.label_binarizer.inverse_transform(categorical_preds) def transform(self, X): """Generate shapelet transform for a set of time series. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. Returns ------- array of shape=(n_ts, n_shapelets) Shapelet-Transform of the provided time series. """ X_ = to_time_series_dataset(X) n_ts, sz, d = X_.shape pred = self.transformer_model.predict( [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)], batch_size=self.batch_size, verbose=self.verbose_level) return pred def locate(self, X): """Compute shapelet match location for a set of time series. Parameters ---------- X : array-like of shape=(n_ts, sz, d) Time series dataset. Returns ------- array of shape=(n_ts, n_shapelets) Location of the shapelet matches for the provided time series. """ X_ = to_time_series_dataset(X) n_ts, sz, d = X_.shape locations = self.locator_model.predict( [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)], batch_size=self.batch_size, verbose=self.verbose_level) return locations.astype(numpy.int) def _set_weights_false_conv(self, d): shapelet_sizes = sorted(self.n_shapelets_per_size.keys()) for i, sz in enumerate(shapelet_sizes): for di in range(d): self.model.get_layer("false_conv_%d_%d" % (i, di)).set_weights( [numpy.eye(sz).reshape((sz, 1, sz))]) def _set_model_layers(self, X, ts_sz, d, n_classes): inputs = [ Input(shape=(ts_sz, 1), name="input_%d" % di) for di in range(d) ] shapelet_sizes = sorted(self.n_shapelets_per_size.keys()) pool_layers = [] pool_layers_locations = [] for i, sz in enumerate(sorted(shapelet_sizes)): transformer_layers = [ Conv1D(filters=sz, kernel_size=sz, trainable=False, use_bias=False, name="false_conv_%d_%d" % (i, di))(inputs[di]) for di in range(d) ] shapelet_layers = [ LocalSquaredDistanceLayer(self.n_shapelets_per_size[sz], X=X, name="shapelets_%d_%d" % (i, di))( transformer_layers[di]) for di in range(d) ] if d == 1: summed_shapelet_layer = shapelet_layers[0] else: summed_shapelet_layer = add(shapelet_layers) pool_layers.append( GlobalMinPooling1D(name="min_pooling_%d" % i)(summed_shapelet_layer)) pool_layers_locations.append( GlobalArgminPooling1D(name="min_pooling_%d" % i)(summed_shapelet_layer)) if len(shapelet_sizes) > 1: concatenated_features = concatenate(pool_layers) concatenated_locations = concatenate(pool_layers_locations) else: concatenated_features = pool_layers[0] concatenated_locations = pool_layers_locations[0] outputs = Dense(units=n_classes if n_classes > 2 else 1, activation="softmax" if n_classes > 2 else "sigmoid", kernel_regularizer=l2(self.weight_regularizer) if self.weight_regularizer > 0 else None, name="classification")(concatenated_features) self.model = Model(inputs=inputs, outputs=outputs) self.transformer_model = Model(inputs=inputs, outputs=concatenated_features) self.locator_model = Model(inputs=inputs, outputs=concatenated_locations) def get_weights(self, layer_name=None): """Return model weights (or weights for a given layer if `layer_name` is provided). Parameters ---------- layer_name: str or None (default: None) Name of the layer for which weights should be returned. If None, all model weights are returned. 
Available layer names with weights are: - "shapelets_i_j" with i an integer for the shapelet id and j an integer for the dimension - "classification" for the final classification layer Returns ------- list list of model (or layer) weights Examples -------- >>> from tslearn.generators import random_walk_blobs >>> X, y = random_walk_blobs(n_ts_per_blob=100, sz=256, d=1, n_blobs=3) >>> clf = ShapeletModel(n_shapelets_per_size={10: 5}, max_iter=0, verbose_level=0) >>> clf.fit(X, y).get_weights("classification")[0].shape (5, 3) """ if layer_name is None: return self.model.get_weights() else: return self.model.get_layer(layer_name).get_weights()
ap.add_argument("-m", "--model", required=True, help="path to output model") ap.add_argument("-o", "--output", required=True, help="path to output directory (logs, plots, etc.)") args = vars(ap.parse_args()) print("[INFO] loading CIFAR-10 data ...") ((train_x, train_y), (test_x, test_y)) = cifar10.load_data() train_x = train_x.astype("float") test_x = test_x.astype("float") mean = np.mean(train_x, axis=0) train_x -= mean test_x -= mean lb = LabelBinarizer() train_y = lb.fit_transform(train_y) test_y = lb.transform(test_y) aug = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, fill_mode="nearest") fig_path = os.path.sep.join([args["output"], "{}.png".format(os.getpid())]) json_path = os.path.sep.join([args["output"], "{}.json".format(os.getpid())]) # callbacks = [TrainingMonitor(fig_path, json_path=json_path), LearningRateScheduler(poly_decay)] callbacks = [LearningRateScheduler(poly_decay)] print("[INFO] compiling model ...") opt = SGD(lr=INIT_LR, momentum=0.9) model = MiniGoogleNet.build(width=32, height=32, depth=3, classes=10) model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"]) print("[INFO] training model ...") model.fit_generator(aug.flow(train_x, train_y, batch_size=64), validation_data=(test_x, test_y),
def do_vgg_train(path_input, width, height, basename, vgg_size, fc_size, logLevel="WARN"): """Train a VGG-like convolutional network """ logvgg = logging.getLogger(f"{__name__}.console.trainvgg") logvgg.setLevel(logLevel) model_file = f"{basename}.model" label_bin_file = f"{basename}.pickle" plot_file = f"{basename}.png" logvgg.debug(f"mf {model_file} lbf {label_bin_file} pf {plot_file}") data, labels = load_dataset(path_input, width, height, "INFO") # partition the data into training and testing splits using 75% of # the data for training and the remaining 25% for testing (trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25) # convert the labels from integers to vectors (for 2-class, binary # classification you should use Keras' to_categorical function # instead as the scikit-learn's LabelBinarizer will not return a # vector) lb = LabelBinarizer() trainY = lb.fit_transform(trainY) testY = lb.transform(testY) # construct the image generator for data augmentation # rotation is ok, shear/shift/flip reduced aug = ImageDataGenerator( rotation_range=30, width_shift_range=0.01, height_shift_range=0.01, shear_range=0.002, zoom_range=0.02, horizontal_flip=False, fill_mode="nearest", ) if vgg_size == "small": # TODO fc_size set from here model = SmallVGGNet.build(width=width, height=height, depth=3, classes=len(lb.classes_)) elif vgg_size == "middle": # default value of fc_size if fc_size == -1: fc_size = 512 model = MiddleVGGNet.build( width=width, height=height, depth=3, classes=len(lb.classes_), fully_connected_size=fc_size, ) else: logvgg.critical(f"Unrecognized dimension {vgg_size}, stopping.") return -1 # initialize our initial learning rate, # of epochs to train for, and batch size INIT_LR = 0.01 EPOCHS = 75 # EPOCHS = 3 BS = 32 # TODO fiddle with this # initialize the model and optimizer (you'll want to use # binary_crossentropy for 2-class classification) logvgg.info("Training network...") opt = SGD(lr=INIT_LR, decay=INIT_LR / EPOCHS) model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"]) # TODO fiddle with this # save model summary summary_file = f"{basename}_summary.txt" with open(summary_file, "w") as sf: model.summary(line_length=100, print_fn=lambda x: sf.write(f"{x}\n")) # using an actual logger: print_fn=logger.info # save the model structure in JSON format config = model.get_config() config_json_file = f"{basename}_structure.json" with open(config_json_file, "w") as jf: json.dump(config, jf) # train the network H = model.fit_generator( aug.flow(trainX, trainY, batch_size=BS), validation_data=(testX, testY), steps_per_epoch=len(trainX) // BS, epochs=EPOCHS, ) # save the model and label binarizer to disk logvgg.info("Serializing network and label binarizer...") model.save(model_file) with open(label_bin_file, "wb") as f: f.write(pickle.dumps(lb)) # evaluate the network logvgg.info("Evaluating network...") predictions = model.predict(testX, batch_size=32) report = classification_report(testY.argmax(axis=1), predictions.argmax(axis=1), target_names=lb.classes_) logvgg.info(f"\n{report}") report_file = f"{basename}_report.txt" with open(report_file, "w") as rf: rf.write(report) # plot the training loss and accuracy N = np.arange(0, EPOCHS) plt.style.use("ggplot") plt.figure() plt.plot(N, H.history["loss"], label="train_loss") plt.plot(N, H.history["val_loss"], label="val_loss") plt.plot(N, H.history["acc"], label="train_acc") plt.plot(N, H.history["val_acc"], label="val_acc") plt.title("Training Loss and Accuracy (SmallVGGNet)") 
plt.xlabel("Epoch #") plt.ylabel("Loss/Accuracy") plt.legend() plt.savefig(plot_file)
'''
data = data.reshape(data.shape[1:])
data = data.transpose()
'''

(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.25, random_state=42)

# convert the labels from integers to vectors (for 2-class, binary
# classification you should use Keras' to_categorical function
# instead as the scikit-learn's LabelBinarizer will not return a
# vector)
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# define the 3072-1024-512-3 architecture using Keras
model = Sequential()
model.add(Dense(1024, input_shape=(3072,), activation="sigmoid"))
model.add(Dense(512, activation="sigmoid"))
model.add(Dense(len(lb.classes_), activation="softmax"))

# initialize our initial learning rate and # of epochs to train for
INIT_LR = 0.01
EPOCHS = 75

# compile the model using SGD as our optimizer and categorical
# cross-entropy loss (you'll want to use binary_crossentropy
# for 2-class classification)
print("[INFO] training network...")
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('-w', '--weights', required=True,
                             help='Path to best model weights file.')
arguments = vars(argument_parser.parse_args())

print('[INFO] Loading CIFAR-10 data...')
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
X_train = X_train.astype('float') / 255.0
X_test = X_test.astype('float') / 255.0

label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)

print('[INFO] Compiling model...')
optimizer = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])

checkpoint = ModelCheckpoint(arguments['weights'], monitor='val_loss',
                             save_best_only=True)
callbacks = [checkpoint]

print('[INFO] Training network...')
H = model.fit(X_train,
def roc_multiclass_curve_rf(y_test_class, y_pred_class):
    lb = LabelBinarizer()
    lb.fit(y_test_class)
    y_test_b = lb.transform(y_test_class)
    y_pred_b = lb.transform(y_pred_class)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr[0], tpr[0], _ = roc_curve(y_test_b[:, 0], y_pred_b[:, 0])
    roc_auc[0] = auc(fpr[0], tpr[0])

    # Compute the micro-average ROC curve and AUC
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_b.ravel(), y_pred_b.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    lw = 1

    # First aggregate all false positive rates
    all_fpr = fpr[0]

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    mean_tpr += np.interp(all_fpr, fpr[0], tpr[0])

    # Only class 0 was interpolated above, so the sum already equals the macro average
    # (the original division by 3 was a bug)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    class_indices = [0]
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(class_indices, colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC For Random Forest')
    plt.legend(loc="lower right")
    plt.savefig('ROC For RF')
    return plt.show()
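# Hedged generalization (hypothetical helper, not from the original code): the macro
# average in roc_multiclass_curve_rf only covers class 0; a generic version averages the
# interpolated TPR over every class column of the binarized arrays.
import numpy as np
from sklearn.metrics import roc_curve, auc

def macro_average_roc(y_test_b, y_pred_b):
    n_classes = y_test_b.shape[1]
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_b[:, i], y_pred_b[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # union of all FPR grid points, then interpolate and average the TPRs
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    return all_fpr, mean_tpr, auc(all_fpr, mean_tpr)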
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

CSV_FILE_PATH = 'F://验证码识别/data.csv'   # path to the CSV file
df = pd.read_csv(CSV_FILE_PATH)             # read the CSV file

# feature columns of the dataset
features = ['v' + str(i + 1) for i in range(16 * 20)]
print(len(features))

labels = df['label'].unique()

# binarize the true labels of the samples
lb = LabelBinarizer()
lb.fit(labels)
y_true = pd.DataFrame(lb.transform(df['label']),
                      columns=['y' + str(i) for i in range(31)])
y_bin_columns = list(y_true.columns)

for col in y_bin_columns:
    df[col] = y_true[col]

# split the dataset into a training set (70%) and a test set (30%)
x_train, x_test, y_train, y_test = train_test_split(df[features], df[y_bin_columns],
                                                    train_size=0.7, test_size=0.3,
                                                    random_state=123)

# build the RNN
# model save path
MODEL_SAVE_PATH = 'logs/RNN_train.ckpt'

# RNN initialization
element_size = 16
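# Aside, not part of the script flow above: once the RNN outputs 31-way scores, the same
# lb can map them back to the original label strings. `y_pred_prob` below is a
# hypothetical stand-in for real model output.
import numpy as np
y_pred_prob = np.eye(31)[[0, 5, 12]]          # fake one-hot "predictions"
print(lb.inverse_transform(y_pred_prob))      # corresponding label strings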
df.columns = [x.lower().replace('.', '_') for x in df.columns] labels, uniques = pd.factorize(df.species) df['label'] = labels train, test = train_test_split(df, test_size=0.2) names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] all_features = np.hstack([column(df, i) for i in names]) train_features = np.hstack([column(train, i) for i in names]) test_features = np.hstack([column(test, i) for i in names]) lb = LabelBinarizer() lb.fit(df.label) all_labels = lb.transform(df.label).astype('float') train_labels = lb.transform(train.label).astype('float') test_labels = lb.transform(test.label).astype('float') # Softmax Regression k = lb.classes_.size x = tf.placeholder(tf.float32, [None, 4]) y = tf.placeholder(tf.float32, [None, 3]) w = tf.Variable(tf.truncated_normal([4, k], stddev=0.1)) b = tf.Variable(tf.truncated_normal([k], stddev=0.1)) y_ = tf.nn.softmax(tf.matmul(x, w) + b) # Loss Function
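# A minimal sketch of the loss and training step that the "# Loss Function" comment
# introduces; the optimizer choice, learning rate and iteration count are assumptions
# (TF 1.x style, matching the tf.placeholder usage above).
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_ + 1e-10), axis=1))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1000):
        sess.run(train_step, feed_dict={x: train_features, y: train_labels})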
labels.append(label) # scale the raw pixel intensities to the range [0, 1] data = np.array(data, dtype="float") / 255.0 labels = np.array(labels) # partition the data into training and testing splits using 75% of # the data for training and the remaining 25% for testing (trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25, random_state=42) # convert the labels from integers to vectors lb = LabelBinarizer().fit(trainY) trainY = lb.transform(trainY) testY = lb.transform(testY) # initialize the model print("[INFO] compiling model...") model = LeNet.build(width=28, height=28, depth=1, classes=9) opt = SGD(lr=0.01) model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"]) # train the network print("[INFO] training network...") H = model.fit(trainX, trainY, validation_data=(testX, testY),
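# Hedged sketch of the evaluation step that typically follows (the fit call above is
# truncated in this fragment): argmax over the one-hot outputs, decoded via lb.classes_.
from sklearn.metrics import classification_report

predictions = model.predict(testX, batch_size=32)
print(classification_report(testY.argmax(axis=1),
                            predictions.argmax(axis=1),
                            target_names=[str(c) for c in lb.classes_]))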
def task1_2(df): print("======================== task 1_2 =============================") df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False) print(df_train.shape) del df['sentence_id'] del df['entity_id'] del df['entity_charOffset'] print(df_train.shape) text_train = df_train['sentence_text'].as_matrix() text_test = df_test['sentence_text'].as_matrix() sw = stopwords.words("english") vectorizer = TfidfVectorizer(lowercase=True, binary=True, stop_words=sw, sublinear_tf=True, norm=None) x_train = vectorizer.fit_transform(text_train).toarray() x_test = vectorizer.transform(text_test).toarray() entity_name_train = vectorizer.transform( df_train['entity_name'].as_matrix()).toarray() entity_name_test = vectorizer.transform( df_test['entity_name'].as_matrix()).toarray() x_train = np.concatenate((x_train, entity_name_train), axis=1) x_test = np.concatenate((x_test, entity_name_test), axis=1) print(df.head()) new_data = [] all_tags = [] for i in range(len(df_train)): text = nltk.word_tokenize(df_train['sentence_text'][i]) tagged_sent = nltk.pos_tag(text) found = False for t in tagged_sent: if (t[0].lower() in df_train['entity_name'][i].lower() and found == False): new_data.append(t[1]) all_tags.append(t[1]) found = True if (found == False): new_data.append('0') all_tags.append('0') dftrain_tag = pd.DataFrame(new_data, columns=['entity_tag']) #print(x_train.shape) #print(len(new_data)) new_datat = [] for i in range(10343, 10343 + len(df_test)): text = nltk.word_tokenize(df_test['sentence_text'][i]) tagged_sent = nltk.pos_tag(text) found = False for t in tagged_sent: if (t[0].lower() in df_test['entity_name'][i].lower() and found == False): new_datat.append(t[1]) all_tags.append(t[1]) found = True if (found == False): new_datat.append('0') all_tags.append('0') dftest_tag = pd.DataFrame(new_datat, columns=['entity_tag']) df_alltags = pd.DataFrame(all_tags, columns=['entity_tag']) #print(x_test.shape) #print(len(new_datat)) #print(dftrain_tag.head()) tags = df_alltags['entity_tag'].unique() tags_dict = dict(zip(tags, range(len(tags)))) dftrain_tag = dftrain_tag.replace(tags_dict) dftest_tag = dftest_tag.replace(tags_dict) #print(dftrain_tag.head()) x_train = np.concatenate((x_train, dftrain_tag), axis=1) x_test = np.concatenate((x_test, dftest_tag), axis=1) #print(x_train[0:6]) y_train = df_train['entity_type'].astype("category").cat.codes.as_matrix() y_test = df_test['entity_type'].astype("category").cat.codes.as_matrix() lb = LabelBinarizer() y_train = lb.fit_transform(y_train) y_test = lb.transform(y_test) pred = simple_nn(x_train, x_test, y_train) #pred = lb.inverse_transform(pred) y_train = lb.inverse_transform(y_train) y_test = lb.inverse_transform(y_test) pred_list = [pred] print(accuracy_score(pred, y_test)) print(f1_score(pred, y_test, average='macro')) lgr = LogisticRegression(C=0.05, class_weight='balanced') lgr.fit(x_train, y_train) pred1 = lgr.predict(x_test) pred_list.append(pred1) print(accuracy_score(pred1, y_test)) print(f1_score(pred1, y_test, average='macro')) svc = LinearSVC(C=0.0004, class_weight='balanced') svc.fit(x_train, y_train) pred2 = svc.predict(x_test) pred_list.append(pred2) print(accuracy_score(pred2, y_test)) print(f1_score(pred2, y_test, average='macro')) rfc = ensemble.RandomForestClassifier(class_weight='balanced') rfc.fit(x_train, y_train) pred3 = rfc.predict(x_test) pred_list.append(pred3) print(accuracy_score(pred3, y_test)) print(f1_score(pred3, y_test, average='macro')) #gb = ensemble.GradientBoostingClassifier() #gb.fit(x_train, y_train) 
#pred = gb.predict(x_test) #pred_list.append(pred) final_pred = [] for i in range(len(pred_list[0])): temp = [0, 0, 0, 0] for j in range(len(pred_list)): temp[pred_list[j][i]] += 1 final_pred.append(np.argmax(temp)) pred = final_pred print(pred) print(accuracy_score(pred, y_test)) print(f1_score(pred, y_test, average='macro')) preds = list(set(pred)) br = [] for j in range(len(preds)): br.append(0) for i in pred: for j in range(len(preds)): if i == preds[j]: br[j] += 1 print(br) preds = list(set(y_test)) br = [] for j in range(len(preds)): br.append(0) for i in y_test: for j in range(len(preds)): if i == preds[j]: br[j] += 1 print(br)
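# Equivalent majority vote in vectorized form (a sketch, assuming pred_list holds
# equal-length 1-D arrays of non-negative integer class codes, as above):
import numpy as np
votes = np.vstack(pred_list)                                         # (n_models, n_samples)
majority = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, votes)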
START_EPOCH = 50

# load the training and testing data, converting the images from
# integers to floats
print("[INFO] loading CIFAR-10 data...")
((x_train, y_train), (x_test, y_test)) = cifar10.load_data()
x_train = x_train.astype("float")
x_test = x_test.astype("float")

# apply mean subtraction to the data
mean = np.mean(x_train, axis=0)
x_train -= mean
x_test -= mean

# convert the labels from integers to vectors
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

# construct the image generator for data augmentation
aug = ImageDataGenerator(width_shift_range=0.1,
                         height_shift_range=0.1,
                         horizontal_flip=True,
                         fill_mode="nearest")

# if a specific model checkpoint is supplied, load it from disk and resume training with
# an updated learning rate (otherwise the network, e.g. ResNet-56, is built and compiled
# from scratch)
if MODEL_PATH:
    print("[INFO] loading {}...".format(MODEL_PATH))
    model = load_model(MODEL_PATH)

    # update the learning rate
    print("[INFO] old learning rate: {}".format(
        K.get_value(model.optimizer.lr)))
    K.set_value(model.optimizer.lr, 1e-2)
    print("[INFO] new learning rate: {}".format(
        K.get_value(model.optimizer.lr)))
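# Hedged sketch of the opposite branch (no checkpoint supplied): build and compile the
# ResNet-56 mentioned above. `ResNet.build` and its arguments are assumptions modeled on
# the other *.build(width, height, depth, classes) calls in this document, not a confirmed
# API; the real script may construct the network differently.
else:
    print("[INFO] compiling ResNet-56...")
    opt = SGD(lr=1e-1, momentum=0.9)
    model = ResNet.build(width=32, height=32, depth=3, classes=10)
    model.compile(loss="categorical_crossentropy", optimizer=opt,
                  metrics=["accuracy"])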
def multiclass_roc_auc_score(y_test, y_pred, average="weighted"): lb = LabelBinarizer() lb.fit(y_test) y_test = lb.transform(y_test) y_pred = lb.transform(y_pred) return metrics.roc_auc_score(y_test, y_pred, average=average)
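# Usage illustration: binarizing both the true labels and the hard predictions lets the
# binary roc_auc_score handle a multiclass problem. The arrays below are made-up examples.
import numpy as np
y_test_demo = np.array([0, 1, 2, 1, 0, 2])
y_pred_demo = np.array([0, 2, 2, 1, 0, 1])
print(multiclass_roc_auc_score(y_test_demo, y_pred_demo, average="weighted"))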
train = 'train.csv' test = 'test.csv' #out = sys.argv[4] batch_size = 1000 lr = 0.01 activation = "sigmoid" # sigmoid or relu hidden_layers = [32,16] #no of units in each layer train_x, train_y = read_data(train) train_x = train_x / 255 test_y = train_y[train_y.shape[0]-10000:] # train_x = scale(train_x) lb = LabelBinarizer() lb.fit([i for i in range(10)]) #since 10 outputs are possible train_y = lb.transform(train_y) #original test dataset #test_x, test_y = read_data(test) #taking test split from train for validation test_x = train_x[train_y.shape[0]-10000:] #test_y = train_y[train_y.shape[0]-10000:] train_x = train_x[:-10000] train_y = train_y[:-10000] # In[76]: #(self, num_inputs, num_hidden_units_list, activation)
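# Hedged instantiation sketch matching the constructor signature hinted at in the comment
# above; the class name NeuralNetwork is an assumption and is not defined in this fragment.
nn = NeuralNetwork(num_inputs=train_x.shape[1],
                   num_hidden_units_list=hidden_layers,
                   activation=activation)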
x_train, x_test, y_train_1000, y_test_1000 = train_test_split( data_1000['sequence'], data_1000['classification'], test_size=0.2, random_state=123) # In[34]: print(x_train.shape) print(x_test.shape) # In[35]: lb = LabelBinarizer() y_train = lb.fit_transform(y_train_1000) y_test = lb.transform(y_test_1000) print('number of classes %d' % y_train.shape[1]) # In[36]: def create_ngram_set(input_list, ngram_value=2): """ Extract a set of n-grams from a list of integers. >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) {(4, 9), (4, 1), (1, 4), (9, 4)} >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] """
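    # (body truncated in this fragment; the line below is a sketch consistent with the
    # docstring above, mirroring the standard Keras n-gram recipe and not necessarily
    # the original implementation)
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))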
class SNLI: def __init__(self, w2vec): self.data = {} self.data['X'] = {} self.data['y'] = {} self.data['X']['train'], self.data['y']['train'] = self.loadData( 'train') self.data['X']['test'], self.data['y']['test'] = self.loadData('test') self.data['X']['dev'], self.data['y']['dev'] = self.loadData('dev') self.le = LabelBinarizer() self.w2vec = w2vec self.le.fit(['entailment', 'neutral', 'contradiction']) def loadData(self, dataset, onlyGoldLabels=True, tokenize=True): """ onlyGoldLabels = True some sentences don't have final label, only have the 5 labels from annotators which don't agree. Ignores such sentences tokenize: splits sentences into tokens """ y = [] X = [] with open('../data/snli/snli_1.0_' + dataset + '.txt') as datafile: prev = None for line in datafile: if prev is None: prev = line continue parts = line.split("\t") if onlyGoldLabels: if parts[0] == '-': continue else: raise NotImplementedError y.append(parts[0]) X.append( [self.preprocess(parts[5]), self.preprocess(parts[6])]) return X, y def preprocess(self, sentence, removePunct=True, lowerCase=False): sentence = sentence.translate(None, string.punctuation) sentence = sentence.lower() return word_tokenize(sentence) def getMaxLengths(self): maxLen = [None, None] for ds in self.data['X']: for sent in self.data['X'][ds]: if maxLen[0] is None or len(sent[0]) > maxLen[0]: maxLen[0] = len(sent[0]) if maxLen[1] is None or len(sent[1]) > maxLen[1]: maxLen[1] = len(sent[1]) return maxLen def getX(self, dataset, start_index, end_index): premise = [] hypothesis = [] sentences = self.data['X'][dataset][start_index:end_index] for pair in sentences: prem = [] for w in pair[0]: try: toappend = self.w2vec.convertWord(w) except KeyError: toappend = self.w2vec.unkWordRep() prem.append(toappend) premise.append(np.asarray(prem)) hyp = [] for w in pair[1]: try: toappend = self.w2vec.convertWord(w) except KeyError: toappend = self.w2vec.unkWordRep() hyp.append(toappend) hypothesis.append(np.asarray(hyp)) rval = np.asarray(premise), np.asarray(hypothesis) return rval def getY(self, dataset, start_index=None, end_index=None): #converts label to 0,1,2 if start_index is not None and end_index is not None: return self.le.transform( self.data['y'][dataset][start_index:end_index]) else: return self.le.transform(self.data['y'][dataset]) def getData(self, dataset): return self.getX(dataset), self.getY(dataset)
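# Quick illustration of what self.le produces for the three SNLI labels (classes_ are
# sorted alphabetically, and transform returns one-hot rows rather than 0/1/2 integers):
from sklearn.preprocessing import LabelBinarizer
le = LabelBinarizer().fit(['entailment', 'neutral', 'contradiction'])
print(le.classes_)                   # ['contradiction' 'entailment' 'neutral']
print(le.transform(['entailment']))  # [[0 1 0]]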
traindata = np.asarray(traindata)
trainlabel = np.asarray(trainlabel)
testdata = np.asarray(testdata)
testlabel = np.asarray(testlabel)
testdata = testdata.astype("float32")
traindata = traindata.astype("float32")
print('Training data shape : ', traindata.shape, trainlabel.shape)
print('Testing data shape : ', testdata.shape, testlabel.shape)

from sklearn.preprocessing import LabelBinarizer

lblbin = LabelBinarizer()
train_labels_onehot = lblbin.fit_transform(trainlabel)
test_labels_onehot = lblbin.transform(testlabel)
detail(train_labels_onehot)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation

# small stack of fully connected layers (no convolutions) on 3-dimensional input
model = Sequential()
model.add(Input(shape=(3,)))
model.add(Dense(3))
model.add(Activation("softmax"))
model.add(Dense(3))
model.add(Activation("relu"))
model.add(Dense(2))
model.add(Dense(3))
model.add(Activation("softmax"))
model.add(Dense(2))
class ActualTreatmentPredictor(_BasePredictor): """Returns the most likely treatments for a patient. Args: prediction_model: The model used to make predictions preprocessor: The preprocessor used to transform the patient features into a format that can be used by the prediction_model recommendation_probability_threshold: The probability threshold that a potential recommendation needs to have a higher probability than to be considered a possible treatment. """ def __init__(self, prediction_model, preprocessor, recommendation_probability_threshold=0.05): super().__init__(prediction_model, preprocessor) self._treatment_label_binarizer = LabelBinarizer() self._recommendation_probability_threshold = recommendation_probability_threshold def _pre_fit_hook(self, data): self._treatment_label_binarizer.fit(data.treatment.unique()) def _get_outcome_data_for_training(self, data): return self._treatment_label_binarizer.transform(data.treatment.values) def _get_predicted_value(self, prediction): return self._treatment_label_binarizer.inverse_transform(prediction) def get_possible_treatments(self, data): """Returns the most likely treatments for a patient. Args: data: A dataframe containing patient features as well as a sample_id column. The sample_id column is needed because there can be many most likely treatments for a sample_id and the column is used to reconcile the treatment with the record. Returns: A dataframe with the following columns: sample_id: The sample_id the treatment is for. treatment: The treatment category """ self._checked_is_trained() # leave comment on structure probabilities_sectioned_by_treatment = self._pipeline.predict_proba( data) ordered_treatments = self._treatment_label_binarizer.classes_ treatment_dfs = [] for (treatment, probabilities_for_treatment) in zip( ordered_treatments, probabilities_sectioned_by_treatment): probability_of_treatment = [ prob[1] if len(prob) > 1 else 0 for prob in probabilities_for_treatment ] df = pd.DataFrame({ "treatment": treatment, "probability_of_treatment": probability_of_treatment, "sample_id": range(len(probability_of_treatment)) }) treatment_dfs.append(df) combined_df = pd.concat(treatment_dfs) # Get all treatments that have a probability greater than the threshold sample_with_high_probability = \ combined_df[combined_df.probability_of_treatment > self._recommendation_probability_threshold] # Get the top probability for a sample_id. This treatment will be used if there is no treatment for the # sample_id greater than the threshold top_treatment_per_sample_id = combined_df.groupby( "sample_id")["probability_of_treatment"].nlargest( 1).reset_index().drop('level_1', axis=1) # Find top treatments for samples that have not treatment above the threshold. This is a rare case but can # happen. samples_ids_with_high_prob = set( sample_with_high_probability.sample_id.unique()) all_sample_ids = set(combined_df.sample_id.unique()) ids_not_in_high_prob = all_sample_ids - samples_ids_with_high_prob top_treatments_for_samples_missing_high_prob =\ top_treatment_per_sample_id[top_treatment_per_sample_id.sample_id.isin(ids_not_in_high_prob)] return pd.concat([ sample_with_high_probability, top_treatments_for_samples_missing_high_prob ])
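# Hypothetical usage sketch: `trained_predictor` (an already-fitted ActualTreatmentPredictor)
# and `patient_df` (a dataframe with a sample_id column plus patient features) are
# assumptions, not defined in this document.
possible = trained_predictor.get_possible_treatments(patient_df)
print(possible.sort_values(["sample_id", "probability_of_treatment"],
                           ascending=[True, False]))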
def zad3(y_test, y_pred, average): lb = LabelBinarizer() lb.fit(y_test) y_test = lb.transform(y_test) y_pred = lb.transform(y_pred) return roc_auc_score(y_test, y_pred, average=average)
def multiclass_roc_auc_score(y_true, y_pred, average='macro'):
    lb = LabelBinarizer()
    lb.fit(y_true)
    y_true = lb.transform(y_true)
    y_pred = lb.transform(y_pred)
    # use the average passed in by the caller instead of hard-coding "weighted"
    return roc_auc_score(y_true, y_pred, average=average)
# save label_maker
print('Saving object to convert output to labels ' + OUT_FOLDER + '/label_maker.' + MODEL_NAME + '.pickle')
with open(OUT_FOLDER + '/label_maker.' + MODEL_NAME + '.pickle', 'wb') as handle:
    pickle.dump(label_maker, handle, protocol=pickle.HIGHEST_PROTOCOL)

labels = list(np.repeat('Coronaviridae', len(Coronaviridae_reads))) + \
         list(np.repeat('Influenza', len(Influenza_reads))) + \
         list(np.repeat('Metapneumovirus', len(Metapneumovirus_reads))) + \
         list(np.repeat('Rhinovirus', len(Rhinovirus_reads))) + \
         list(np.repeat('Sars_cov_2', len(Sars_cov_2_reads))) + \
         list(np.repeat('Human', len(Human)))

labels_proces = label_maker.transform(labels)

# Tokenize the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_sequences)

print('Converting reads into k-mers of length ' + str(K_MERS))
sequences_preproces = tokenizer.texts_to_sequences(total_sequences)
max_length = max([len(s.split()) for s in total_sequences])

# pad sequences
sequences_preproces = pad_sequences(sequences_preproces, maxlen=max_length, padding='post')

print('Saving tokenizer object ' + OUT_FOLDER + '/tokenizer.' + MODEL_NAME + '.pickle')
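# Hedged aside: recovering family names from model outputs later on, assuming label_maker
# is the fitted LabelBinarizer pickled above; `fake_probs` is a stand-in for real predictions.
import numpy as np
fake_probs = np.eye(len(label_maker.classes_))[[0, 3]]
print(label_maker.inverse_transform(fake_probs))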
data.append(image)
labels.append(label)

# scale the raw pixel intensities to the range [0, 1] (this improves training)
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)

# Split the training data into separate train and test sets
(X_train, X_test, Y_train, Y_test) = train_test_split(data, labels, test_size=0.25, random_state=0)

# Convert the labels (letters) into one-hot encodings that Keras can work with
lb = LabelBinarizer().fit(Y_train)
Y_train = lb.transform(Y_train)
Y_test = lb.transform(Y_test)

# Save the mapping from labels to one-hot encodings.
# We'll need this later when we use the model to decode what its predictions mean
with open(MODEL_LABELS_FILENAME, "wb") as f:
    pickle.dump(lb, f)

# Build the neural network!
model = Sequential()

# First convolutional layer with max pooling
model.add(
    Conv2D(20, (5, 5), padding="same", input_shape=(20, 20, 1),
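# Hedged sketch of the "later" decode step the comment above refers to (the Conv2D call is
# truncated in this fragment): load the pickled binarizer and turn a prediction back into
# a letter. `model` and `letter_image` are assumptions, not defined here.
import pickle
import numpy as np

with open(MODEL_LABELS_FILENAME, "rb") as f:
    lb = pickle.load(f)

prediction = model.predict(np.expand_dims(letter_image, axis=0))  # hypothetical (20, 20, 1) image
letter = lb.inverse_transform(prediction)[0]
print(letter)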