class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word',
                                    ngram_range=(1, 2),
                                    stop_words='english')
        reviews_text = [
            ' '.join(list(chain.from_iterable(review))) for review in reviews
        ]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()
        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)
        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)
        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()
        trunc = self.truncated.transform(tfidf_matrix)
        return tuple(
            self.kbest.transform(self.normalizer.transform(trunc))[0, :])
def dataloader(datadir, dataload):
    data = pd.read_csv(os.path.join(datadir, 'LUADLUSC_float32.tsv'), sep='\t')
    data_copy = data.copy()
    data_copy.set_index('sample', inplace=True)

    # dataload == 1: log-transform the whole dataset, then scale it
    if dataload == 1:
        X = data_copy.iloc[:, :-1]
        y_target = data_copy.iloc[:, -1]
        X_columns = X.columns
        X_index = X.index
        # log transform
        X = X.apply(np.log1p)
        # scaling (a per-sample Normalizer is used here)
        scaler = Normalizer()
        scaler.fit(X)
        X = scaler.transform(X)
        X = pd.DataFrame(X, columns=X_columns, index=X_index)
    elif dataload == 2:
        X = data_copy.iloc[:, :-1]
        y_target = data_copy.iloc[:, -1]
    elif dataload == 3:
        # reduce the number of features using the C4 computational gene set
        data_gene = pd.read_csv(os.path.join(datadir, 'c4_entrez.gmt.txt'),
                                sep='\t',
                                engine='python',
                                header=None,
                                index_col=0,
                                error_bad_lines=False)
        data_gene.drop(1, axis=1, inplace=True)
        data_gene.fillna(0, inplace=True)
        gene_idx = []
        for idx in data_gene.index:
            gene = data_gene.loc[idx].values.astype(int)
            gene_idx.append(gene)
        idx = []
        for i in range(len(gene_idx)):
            for j in gene_idx[i]:
                idx.append(j)
        set_idx = set(idx)
        lst_idx = list(set_idx)
        # build the new feature dataframe
        # Entrez IDs of genes present in both the computational gene set and the original data
        overlap_idx = []
        for col in data_copy.columns[:-1]:
            if int(col.split('|')[0].split(':')[1]) in lst_idx:
                overlap_idx.append(col)
        data_copy2 = data_copy.loc[:, overlap_idx]  # newly built dataframe
        data_copy2['target,LUAD:0,LUSC:1'] = data_copy['target,LUAD:0,LUSC:1']
        X = data_copy2.iloc[:, :-1]
        y_target = data_copy2.iloc[:, -1]
    return X, y_target
def preproc(X_train, X_test, y_train, y_test, specified_scaler,
            feature_selection, feature_limit=21):
    # Scale feature values
    if specified_scaler == "none":
        scaler = Normalizer()
    if specified_scaler == "standard":
        scaler = StandardScaler()  # works well when outliers are negligible
        # scaler = MinMaxScaler(feature_range=(0, 10))  # works well when outliers are negligible
    # RobustScaler keeps (does not prune) small and large outliers, given a percentile range, and scales the rest of the data
    if specified_scaler == "robust":
        scaler = RobustScaler(quantile_range=(25, 75))
    # QuantileTransformer changes the distribution and even turns outliers into inliers -- good for uniform data
    if specified_scaler == "quantile":
        scaler = QuantileTransformer(output_distribution='uniform')  # fast and great
    # PowerTransformer finds the optimal scaling factor to stabilize variance through maximum likelihood estimation
    if specified_scaler == "power":
        scaler = PowerTransformer(method='yeo-johnson')  # great
    # Fit on the training set only
    scaler.fit(X_train[feature_selection[:feature_limit]])
    # Apply the transform to both the training set and the test set (standard naming conventions)
    X_train = scaler.transform(X_train[feature_selection[:feature_limit]])
    X_test = scaler.transform(X_test[feature_selection[:feature_limit]])
    return X_train, X_test, y_train, y_test
def _test_normalizer_converter(self, norm):
    warnings.filterwarnings("ignore")
    X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)

    # Create SKL model for testing
    model = Normalizer(norm=norm)
    model.fit(X)

    # Create ONNX-ML model
    onnx_ml_model = convert_sklearn(
        model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])

    # Create ONNX model by calling converter
    onnx_model = convert(onnx_ml_model, "onnx", X)

    # Get the predictions for the ONNX-ML model
    session = ort.InferenceSession(onnx_ml_model.SerializeToString())
    output_names = [
        session.get_outputs()[i].name for i in range(len(session.get_outputs()))
    ]
    onnx_ml_pred = [[] for i in range(len(output_names))]
    inputs = {session.get_inputs()[0].name: X}
    onnx_ml_pred = session.run(output_names, inputs)

    # Get the predictions for the ONNX model
    session = ort.InferenceSession(onnx_model.SerializeToString())
    onnx_pred = [[] for i in range(len(output_names))]
    onnx_pred = session.run(output_names, inputs)

    return onnx_ml_pred, onnx_pred
def face_classification():
    data = load('attendance/model/sample_data_face_embeddings.npz')
    trainX, trainy, testX, testy = data['arr_0'], data['arr_1'], data['arr_2'], data['arr_3']
    print("Dataset: train=%d, test=%d" % (trainX.shape[0], testX.shape[0]))

    # normalize the face embeddings
    in_encoder = Normalizer(norm='l2')
    in_encoder.fit(trainX)
    trainX = in_encoder.transform(trainX)
    testX = in_encoder.transform(testX)

    # encode the labels
    out_encoder = LabelEncoder()
    out_encoder.fit(trainy)
    save_pickle('attendance/model/label_encoder.pkl', out_encoder)
    trainy = out_encoder.transform(trainy)
    testy = out_encoder.transform(testy)

    # fit the classifier
    model = SVC(kernel="linear", probability=True)
    model.fit(trainX, trainy)
    yhat_train = model.predict(trainX)
    yhat_test = model.predict(testX)

    # saving the model in a pkl file
    save_pickle('attendance/model/svm_model.pkl', model)

    score_train = accuracy_score(trainy, yhat_train)
    score_test = accuracy_score(testy, yhat_test)
    print('Accuracy: train=%.3f, test=%.3f' % (score_train * 100, score_test * 100))
    print("Training is done!!!")
def normalize_select(normalize, x_train=None, x_val=None):
    x_train_index = x_train.index
    x_val_index = x_val.index
    if normalize == "Normalizer":
        scaler = Normalizer()
        scaler.fit(x_train)
        x_train_new = scaler.transform(x_train)
        x_val_new = scaler.transform(x_val)
        x_train = pd.DataFrame(x_train_new, columns=x_train.columns, index=x_train_index)
        x_val = pd.DataFrame(x_val_new, columns=x_val.columns, index=x_val_index)
    if normalize == "Minmax":
        scaler = MinMaxScaler()
        scaler.fit(x_train)
        x_train_new = scaler.transform(x_train)
        x_val_new = scaler.transform(x_val)
        x_train = pd.DataFrame(x_train_new, columns=x_train.columns, index=x_train_index)
        x_val = pd.DataFrame(x_val_new, columns=x_val.columns, index=x_val_index)
    return x_train, x_val
def display_plot(csv, t_size, max_neigh):
    gt = pd.read_csv(csv)
    cols = [col for col in gt.columns if col not in ['label']]
    data = gt[cols]
    target = gt['label']
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=t_size, random_state=0)
    scaler = Normalizer()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)
    training_accuracy = []
    test_accuracy = []
    neighbors_settings = range(1, max_neigh)
    for n_neighbors in neighbors_settings:
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        clf.fit(data_train, target_train)
        training_accuracy.append(clf.score(data_train, target_train))
        test_accuracy.append(clf.score(data_test, target_test))
    plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
    plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("k")
    plt.legend()
class NormalizerPrim(primitive):
    def __init__(self, random_state=0):
        super(NormalizerPrim, self).__init__(name='Normalizer')
        self.id = 13
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = (
            "Normalize samples individually to unit norm. Each sample (i.e. each row of the "
            "data matrix) with at least one non zero component is rescaled independently of "
            "other samples so that its norm (l1 or l2) equals one. This transformer is able to "
            "work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you "
            "want to avoid the burden of a copy / conversion). Scaling inputs to unit norms is "
            "a common operation for text classification or clustering for instance. For "
            "instance the dot product of two l2-normalized TF-IDF vectors is the cosine "
            "similarity of the vectors and is the base similarity metric for the Vector Space "
            "Model commonly used by the Information Retrieval community.")
        self.hyperparams_run = {'default': True}
        self.scaler = Normalizer()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_nrmlzr".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']), columns=cols)
        final_output = {0: output}
        return final_output
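The description above notes that the dot product of two l2-normalized vectors equals their cosine similarity; the following is a minimal illustrative sketch of that property using plain scikit-learn (it is not part of NormalizerPrim, and the example data and variable names are made up for illustration):

import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_similarity

# Two arbitrary "TF-IDF-like" row vectors (hypothetical example data).
rows = np.array([[1.0, 2.0, 0.0],
                 [2.0, 1.0, 1.0]])
# L2-normalize each row independently.
unit = Normalizer(norm='l2').fit_transform(rows)
# The dot product of the normalized rows ...
dot = float(unit[0] @ unit[1])
# ... equals the cosine similarity of the original rows.
cos = float(cosine_similarity(rows[:1], rows[1:2])[0, 0])
assert np.isclose(dot, cos)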
class SimpleNormalizer(Transormer):
    def __init__(self, env):
        super().__init__()
        self._env = env
        self._norm = Normalizer()
        self._norm.fit(self._gen_data(10000))

    def _gen_data(self, cap):
        data = []
        while len(data) < cap:
            data.append(self._env.reset())
            done = False
            while not done:
                action = self._env.action_space.sample()
                s, _, done, _ = self._env.step(action)
                data.append(s)
        return data

    def transform(self, state):
        res = self._norm.transform([state])[0]
        return res

    def save(self, path):
        fp = os.path.join(path, 'norm.pkl')
        with open(fp, 'wb') as f:
            pickle.dump(self._norm, f)

    def load(self, path):
        fp = os.path.join(path, 'norm.pkl')
        with open(fp, 'rb') as f:
            self._norm = pickle.load(f)
def data_preprocess(data):
    # your code here
    # example:
    label = LabelEncoder()
    label_count = 0
    for col in data:
        if data[col].dtype == 'object':
            if len(list(data[col].unique())) <= 2:
                # Train on data
                label.fit(data[col])
                # Transform data
                data[col] = label.transform(data[col])
                label_count += 1
    x = pd.get_dummies(data)
    scaler = Normalizer()
    imputer = Imputer(strategy='median')
    imputer.fit(x)
    x = imputer.transform(x)
    scaler.fit(x)
    x = scaler.transform(x)
    # your code end
    return x
class CatEncoder:
    def __init__(self, cat_columns, data, normalize: bool = True):
        self.cat_indexes = [data.columns.get_loc(name) for name in cat_columns]
        self.num_indexes = [
            idx for idx in range(len(data.columns)) if idx not in self.cat_indexes
        ]
        self.encoder = preprocessing.OneHotEncoder()
        self.encoder.fit(data[cat_columns])
        self.num_columns = list(data.columns[self.num_indexes])
        self.cat_columns = cat_columns
        cat_transformed_names = self.encoder.get_feature_names(
            input_features=self.cat_columns)
        self._transformed_column_names = self.num_columns + list(cat_transformed_names)
        if normalize:
            self.normalizer = Normalizer()
            self.normalizer.fit(data.iloc[:, self.num_indexes])
        else:
            self.normalizer = None

    def __call__(self, x):
        numeric = x[:, self.num_indexes]
        if self.normalizer is not None:
            numeric = self.normalizer.transform(numeric)
        categorical = self.encoder.transform(x[:, self.cat_indexes]).toarray()
        return np.concatenate((numeric, categorical), axis=1)

    @property
    def transformed_features(self):
        return self._transformed_column_names
def run_grid_search(config, grid_search, feature_vectors, labels,
                    classifier_model='svm', scale=True, normalize=False):
    if config.has("model"):
        classifier_model = config.get("model")
        scale = config.get("scale")
        normalize = config.get("normalize")
    print classifier_model
    print 'Scale:',
    print scale
    print 'Normalize:',
    print normalize
    if scale:
        scaler = StandardScaler()
        scaler.fit(feature_vectors)
        feature_vectors = scaler.transform(feature_vectors)
    if normalize:
        normalizer = Normalizer()
        normalizer.fit(feature_vectors)
        feature_vectors = normalizer.transform(feature_vectors)
    if classifier_model == 'svm':
        grid_search.fit(feature_vectors, labels)
def Normalizer(self):
    esti = Normalizer()
    esti.fit(self.feature_data)
    new_data = Normalizer().fit_transform(self.feature_data)
    # print(new_data.shape)
    # print(new_data)
    return new_data
def reduce_dim(data, labels, n_components, **kwargs):
    '''performs dimensionality reduction'''
    if kwargs['method'] == 'pca':
        matrix = data
        # transformer = Normalizer()
        # transformer.fit(matrix)
        pca = PCA(n_components=n_components, svd_solver='full')
        pca.fit(matrix)
        # return pca.fit_transform(matrix)
        return pca.transform(matrix)
    if kwargs['method'] == 'lda':
        transformer = Normalizer()
        label = labels
        matrix = data
        transformer.fit(matrix)
        lda = LDA(n_components=n_components)
        lda.fit(transformer.transform(matrix), label)
        return lda.transform(matrix)
    if kwargs['method'] == 'ica':
        matrix = data
        ica = ICA(n_components=n_components, random_state=0)
        return ica.fit_transform(matrix)
def generate_attendence(entities, relations, attributes, param):
    # initialize lecture attendance completely at random
    attend = dict()
    for s in relations["s_c"]:
        for c in relations["s_c"][s]:
            attend[(s, c)] = random.choice([0, 1])

    # Gibbs sampling
    for t in progressbar.progressbar(range(param.time)):
        print(f"\n {np.mean(np.fromiter(attend.values(), dtype=float)) * 100}")
        for s in relations["s_c"]:
            for c in relations["s_c"][s]:
                prev_attend = 0.3 * attend[(s, c)]
                phi_skill = 0
                friends_attend = summarize.beta_9 * friends_attendence(s, c, relations, attend)
                diff = attributes["d"][c]
                noise = normal()
                attend[(s, c)] = prev_attend + phi_skill + friends_attend + diff + noise
        # rescale the scores and re-binarize attendance for the next sweep
        all_values = np.fromiter(attend.values(), dtype=float).reshape(-1, 1)
        normalizer = Normalizer()
        normalizer.fit(all_values)
        for k in attend:
            v = np.array(attend[k]).reshape(-1, 1)
            attend[k] = random.random() < expit(normalizer.transform(v))
def normalize_data(X_train, X_test):
    # remove overlap
    cut = int(X_train.shape[1] / 2)
    longX = X_train[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatX_train = X_train.reshape((X_train.shape[0] * X_train.shape[1], X_train.shape[2]))
    flatX_test = X_test.reshape((X_test.shape[0] * X_test.shape[1], X_test.shape[2]))
    # normalize: fit on training data
    s = Normalizer()
    s.fit(longX)
    # apply to training and test data
    longX = s.transform(longX)
    flatX_train = s.transform(flatX_train)
    flatX_test = s.transform(flatX_test)
    # reshape
    flatX_train = flatX_train.reshape((X_train.shape))
    flatX_test = flatX_test.reshape((X_test.shape))
    return flatX_train, flatX_test
def prepare_X(X_raw, X_full_raw):
    X_before = np.array([e.ravel() for e in X_raw])
    X_full_before = np.array([e.ravel() for e in X_full_raw])
    n = Normalizer()
    n.fit(X_full_before)
    result = np.array([np.split(e, 100) for e in n.transform(X_before)])
    return result
def normalization(os_list, X_test):
    X_test_scaled = []
    for i in range(len(os_list)):
        sc = Normalizer(norm='l2')
        sc.fit(os_list[i][0])
        os_list[i][0] = sc.transform(os_list[i][0])
        X_test_scaled.append(sc.transform(X_test))
    return os_list, X_test_scaled
def test_normalizer():
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X)
        assert np.allclose(X_t, X_t_)
def normalize_data(x_train, x_test):
    normalizer = Normalizer()
    # Fit our normalizer to our training data.
    normalizer.fit(x_train)
    # Transform the training data using our fitted normalizer.
    x_train = normalizer.transform(x_train)
    # Transform the testing data using the normalizer fitted on x_train.
    x_test = normalizer.transform(x_test)
    return x_train, x_test
class ScikitNormalizer(object):
    def __init__(self):
        self.data_normalizer = Normalizer()

    def fit(self, data):
        self.data_normalizer.fit(data)

    def transform(self, data):
        return (self.data_normalizer.transform(data) + 1) / 2
def test_normalizer_sparse():
    X_sparse = tosparse(X)
    for norm in ["l1", "l2", "max"]:
        tform = Normalizer(norm=norm)
        tform.fit(X)
        tform_ = convert_estimator(tform)
        X_t = tform.transform(X)
        X_t_ = tform_.transform(X_sparse)
        assert np.allclose(X_t, X_t_.todense())
def train_model(X_train, y_train):
    normalizer = Normalizer()
    normalizer.fit(X_train)
    normalized_data = normalizer.transform(X_train)
    ada_boost_classifier = AdaBoostClassifier()
    ada_boost_classifier.fit(normalized_data, y_train)
    return ada_boost_classifier, normalizer
def pca(x_train, x_test):
    normalizer = Normalizer()
    normalizer.fit(x_train)
    x_train = normalizer.transform(x_train)
    x_test = normalizer.transform(x_test)
    pca = PCA()
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    # return the transformed splits so the caller can use them
    return x_train, x_test
def normalize_data(dataframe):
    """
    :param dataframe: DataFrame
    :return: DataFrame of normalized data
    """
    normal = Normalizer()
    normal.fit(dataframe)
    data = normal.transform(dataframe)
    data = pd.DataFrame(data, columns=list(dataframe))
    return data
def test_model_normalizer(self):
    model = Normalizer(norm="l2")
    x = numpy.random.randn(10, 1).astype(numpy.int64)
    model.fit(x)
    model_onnx = convert_sklearn(model, "scikit-learn normalizer",
                                 [("input", Int64TensorType([None, 1]))],
                                 target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(len(model_onnx.graph.node) == 1)
def normalize_usecase():
    X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
    normalize = Normalizer(norm="l1")
    normalize.fit(X_train)
    X_L1_normalize = normalize.transform(X_train)
    print(X_L1_normalize)
    normalize = Normalizer(norm="l2")
    normalize.fit(X_train)
    X_L2_normalize = normalize.transform(X_train)
    print(X_L2_normalize)
class RepresentationNormal():
    def __init__(self, norm=DEFAULT_NORM):
        self.norm = norm
        self.normalizer = Normalizer(norm=self.norm)

    def fit(self, data):
        self.normalizer.fit(data)

    def transform(self, data):
        return self.normalizer.transform(data)
def test_onnx_normalizer_converter_raises_rt(self):
    warnings.filterwarnings("ignore")
    X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)
    model = Normalizer(norm="l1")
    model.fit(X)

    # generate test input
    onnx_ml_model = convert_sklearn(
        model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])
    onnx_ml_model.graph.node[0].attribute[0].s = "".encode()

    self.assertRaises(RuntimeError, convert, onnx_ml_model, "onnx", X)
class KNN(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ",
              self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))
def test_normalizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Normalizer
    # with sklearn.preprocessing.Normalizer
    normalizerr = NormalizerR()
    normalizerr.fit(np.concatenate(trajs))

    normalizer = Normalizer()
    normalizer.fit(trajs)

    y_ref1 = normalizerr.transform(trajs[0])
    y1 = normalizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def test_sklearn_transform():
    transformer = Normalizer()
    transformer.fit(X_train)

    computation = SklearnTransform("test-sklearn", transformer, istreams=[], ostream="out")
    context = ComputationContext(computation)
    data = pd.DataFrame(X_test).to_json(orient="records")
    computation.process_record(context, Record("transform", data, None))

    assert len(context.records) == 1
    assert len(context.records["out"]) == 1

    record = context.records["out"][0]
    assert record.key == "transform"
    assert np.allclose(transformer.transform(X_test), json.loads(record.data))
data_np[:, [5]] = le.transform(np.ravel(data_np[:, [5]])).reshape(n_lin, 1)

# Encode the label in column 11
le2 = LabelEncoder()
le2.fit(np.ravel(data_np[:, [10]]))
# print le2.classes_
data_np[:, [10]] = le2.transform(np.ravel(data_np[:, [10]])).reshape(n_lin, 1)

# Replace missing values by 0 for columns 16 and 17
data_np = preprocess_replace_NaN(data_np, [15, 16], 'nan')
# plot_NA_ratio_features(data_np, feature_names)

# Normalize the dataset for columns 5, 6, 7, 10, 11, 13, 14, 17 and 25
nor = Normalizer(norm='l1')
nor.fit(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))
# [0, 1, 2, 6, 11, 17, 18, 19, 20, 21, 22, 23]
data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]] = \
    nor.transform(data_np[:, [4, 5, 6, 9, 10, 12, 13, 16, 24]].astype(np.float64))

# Replace missing values for the risk_factor using an SVM classifier
preprocess_missing_risk_factor(data_np)
# plot_pourcentage_result(data_np, feature_names, [17, 18, 19, 20, 21, 22, 23])
# plot_NA_ratio_features(data_np, feature_names)

################################################################################
#
# Replace all missing values for columns 12, 16 and 17 with the median value
for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score
            print "C, gamma, score", C, gamma, score

# normalizer
norm = Normalizer()
norm.fit(X)
T = norm.transform(X)
X_train, X_test, y_train, y_test = train_test_split(T, y, test_size=0.3, random_state=7)
for C in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score
            print "C, gamma, score", C, gamma, score

# maxabs
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.externals import joblib
from grid_search import grid_estimation

# load the matrix of text features and assigned clusters
all_data = genfromtxt('features_and_clusters.csv', delimiter=',')
data = all_data[:, 0:29]
target = all_data[:, 29]

# normalization and scaling of the data
normalizer = Normalizer()
normalizer.fit(data)
data = normalizer.transform(data)
scaler = StandardScaler()
data = scaler.fit_transform(data)

# choose the training and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=0)

# clf = svm.SVC(kernel="rbf", gamma=0.001, C=1000).fit(X_train, y_train)
clf = svm.SVC(kernel="linear", gamma=1.0, C=1).fit(X_train, y_train)

# save the classifier, scaler and normalizer
joblib.dump(clf, 'classifier_data\\model.pkl')
joblib.dump(scaler, 'classifier_data\\scaler.pkl')
joblib.dump(normalizer, 'classifier_data\\normalizer.pkl')
# Append new features
newAct_train = np.zeros((activation_train.shape[0], activation_train.shape[1] + 3))
for i in range(activation_train.shape[0]):
    newAct_train[i] = np.append(activation_train[i], pttImg_sample_train[i][:3])
newAct_valid = np.zeros((activation_valid.shape[0], activation_valid.shape[1] + 3))
for i in range(activation_valid.shape[0]):
    newAct_valid[i] = np.append(activation_valid[i], valid_pttImg[i][:3])
newAct_test = np.zeros((activation_test.shape[0], activation_test.shape[1] + 3))
for i in range(activation_test.shape[0]):
    newAct_test[i] = np.append(activation_test[i], test_blogImg[i][:3])

# Normalize
normalizer = Normalizer()
normalizer.fit(newAct_train)
newAct_train = normalizer.transform(newAct_train)
newAct_valid = normalizer.transform(newAct_valid)
newAct_test = normalizer.transform(newAct_test)

# Final model
model3 = Sequential()
model3.add(Dense(2, input_shape=(newAct_train.shape[1],), activation='softmax'))
adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model3.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
print(model3.summary())
model3.fit(newAct_train, y_train_sample, epochs=epochs3, batch_size=batch_size)

# Evaluate using the validation data or the testing data
print("Valid:")
scores = model3.evaluate(newAct_valid, y_valid_sample, verbose=0)
def main():
    paths = ['C:/Data/crowdflower/train.csv', 'C:/Data/crowdflower/test.csv']
    t = p.read_csv(paths[0])
    t2 = p.read_csv(paths[1])

    class LancasterTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.LancasterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

    class PorterTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.PorterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

    class WordnetTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.WordNetLemmatizer()
        def __call__(self, doc):
            return [self.wnl.lemmatize(t) for t in wordpunct_tokenize(doc)]

    class SnowballTokenizer(object):
        def __init__(self):
            self.wnl = nltk.stem.SnowballStemmer("english")
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

    tfidf1 = TfidfVectorizer(max_features=85000, strip_accents='unicode', analyzer='word',
                             token_pattern=r'\w{3,}', sublinear_tf=1, ngram_range=(1, 2),
                             tokenizer=SnowballTokenizer())
    tfidf2 = TfidfVectorizer(max_features=600000, strip_accents='unicode', analyzer='char',
                             sublinear_tf=1, ngram_range=(2, 17))
    tfidf3 = CountVectorizer(max_features=5200, strip_accents='unicode', analyzer='word',
                             token_pattern=r'\w{3,}', ngram_range=(1, 3),
                             tokenizer=SnowballTokenizer())
    tfidf4 = CountVectorizer(max_features=1800, strip_accents='unicode', analyzer='char',
                             ngram_range=(2, 9))
    tfidf5 = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='char_wb',
                             sublinear_tf=1, ngram_range=(2, 9))
    tfidf6 = CountVectorizer(max_features=1800, strip_accents='unicode', analyzer='char_wb',
                             ngram_range=(2, 9))
    tfidf7 = TfidfVectorizer(max_features=85000, strip_accents='unicode', analyzer='word',
                             sublinear_tf=1, ngram_range=(1, 2), tokenizer=SnowballTokenizer())
    tfidf8 = CountVectorizer(max_features=4900, strip_accents='unicode', analyzer='word',
                             ngram_range=(1, 3), tokenizer=SnowballTokenizer())
    tfidf9 = CountVectorizer(max_features=5200, strip_accents='unicode', analyzer='word',
                             token_pattern=r'\w{1,}', ngram_range=(1, 3),
                             tokenizer=SnowballTokenizer())
    tfidf10 = TfidfVectorizer(max_features=85000, strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{1,}', sublinear_tf=1, ngram_range=(1, 2),
                              tokenizer=SnowballTokenizer())
    tfidf11 = CountVectorizer(max_features=5200, strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{3,}', binary=True, ngram_range=(1, 3),
                              tokenizer=SnowballTokenizer())
    tfidf12 = CountVectorizer(max_features=5200, strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{1,}', binary=True, ngram_range=(1, 3),
                              tokenizer=SnowballTokenizer())
    tfidf13 = CountVectorizer(max_features=5200, strip_accents='unicode', analyzer='word',
                              binary=True, ngram_range=(1, 3), tokenizer=SnowballTokenizer())

    vectorizers = [tfidf1, tfidf2, tfidf3, tfidf4, tfidf5, tfidf6,
                   tfidf7, tfidf8, tfidf9, tfidf10, tfidf11, tfidf12, tfidf13]
    # vectorizers = [tfidf1, tfidf3, tfidf5, tfidf6]
    # vectorizers = [tfidf1]
    # comment = 'full, SnowballTokenizer no RF'

    use_lsa = 0
    cv_split = 0.2
    n = int(np.round(len(t['tweet'].tolist())))
    train_end = int(np.round(n * (1 - cv_split)))
    cv_beginning = int(np.round(n * (1 - cv_split if cv_split > 0 else 0.8)))
    y = np.array(t.ix[:, 4:])
    train = t['tweet'].tolist()[0:train_end]
    cv_X_original = np.array(t['tweet'].tolist()[cv_beginning:])
    cv_y = np.array(y[cv_beginning:])
    if cv_split == 0:
        train = t['tweet'].tolist()
    else:
        y = y[0:int(np.round(len(t['tweet'].tolist()) * (1 - cv_split)))]

    prediction_grand_all = 0
    predict_cv_grand_all = 0
    list_predictions = []
    list_predictions_test = []
    for tfid in vectorizers:
        print 'fitting vectorizer...'
        tfid.fit(t['tweet'].tolist() + t2['tweet'].tolist())
        print 'transforming train set...'
        X = tfid.transform(train)
        print 'transforming cv set...'
        cv_X = tfid.transform(cv_X_original)
        print 'transforming test set...'
        test = tfid.transform(t2['tweet'])

        clf1 = MultiTaskLasso()
        clf2 = AdaBoostRegressor(learning_rate=1, n_estimators=10)
        clf3 = RandomForestRegressor(max_depth=20, n_estimators=36, max_features=100, n_jobs=6)
        clf4 = Ridge()
        clfs = [clf4, clf3]
        lsa_classifier = [0, 1]
        prediction_all = 0
        predict_cv_all = 0
        for clf, use_lsa in zip(clfs, lsa_classifier):
            if use_lsa == 1:
                lsa = TruncatedSVD(n_components=100)
                print 'fitting lsa...'
                lsa.fit(X, y)
                print 'transforming with lsa...'
                X = lsa.transform(X)
                cv_X = lsa.transform(cv_X)
                test = lsa.transform(test)
                print 'normalizing...'
                norm = Normalizer()
                norm.fit(X, y)
                X = norm.transform(X, copy=False)
                test = norm.transform(test, copy=False)
                cv_X = norm.transform(cv_X, copy=False)
            else:
                fac = p.Categorical(t['state'].tolist() + t2['state'].tolist())
                t_matrix = u.create_t_matrix(fac.labels)
                train_feat = t_matrix[0:train_end]
                cv_X_feat = t_matrix[cv_beginning:n]
                test_feat = t_matrix[n:]
                X = sparse.hstack([X, sparse.csr_matrix(train_feat)])
                cv_X = sparse.hstack([cv_X, sparse.csr_matrix(cv_X_feat)])
                test = sparse.hstack([test, sparse.csr_matrix(test_feat)])
            t0 = time.time()
            print 'fitting...'
            clf.fit(X, y)
            print 'validating...'
            print 'Train error: {0}'.format(np.sqrt(np.sum(np.array(np.array(clf.predict(X)) - y)**2) / (X.shape[0]*24.0)))
            prediction = np.array(clf.predict(test))
            prediction = np.abs(prediction*(prediction > 0))
            prediction[prediction > 1] = 1
            predict_cv = np.array(clf.predict(cv_X))
            predict_cv = np.abs(predict_cv*(predict_cv > 0))
            predict_cv[predict_cv > 1] = 1
            list_predictions.append(predict_cv)
            list_predictions_test.append(prediction)
            print 'Cross validation error: {0}'.format(np.sqrt(np.sum(np.array(predict_cv - cv_y)**2) / (cv_X.shape[0]*24.0)))
            predict_cv_all = predict_cv + predict_cv_all
            prediction_all = prediction + prediction_all
            print 'fitted model in {0} seconds'.format(np.round(time.time() - t0, 2))
        prediction_all /= len(clfs)*1.0
        predict_cv_all /= len(clfs)*1.0
        print 'Cross validation error ensemble: {0}'.format(np.sqrt(np.sum(np.array(predict_cv_all - cv_y)**2) / (cv_X.shape[0]*24.0)))
        prediction_grand_all = prediction_all + prediction_grand_all
        predict_cv_grand_all = predict_cv_all + predict_cv_grand_all
    prediction_grand_all /= len(vectorizers)*1.0
    predict_cv_grand_all /= len(vectorizers)*1.0
    print 'Cross validation error grand ensemble: {0}'.format(np.sqrt(np.sum(np.array(predict_cv_grand_all - cv_y)**2) / (cv_X.shape[0]*24.0)))
    # log.info(comment)
    # log.info(np.sqrt(np.sum(np.array(predict_cv_grand_all - cv_y)**2) / (cv_X.shape[0]*24.0)))
    prediction = np.array(np.hstack([np.matrix(t2['id']).T, prediction_grand_all]))
    col = '%i,' + '%f,'*23 + '%f'
    np.savetxt('C:/data/crowdflower/sklearn_prediction.csv', prediction, col, delimiter=',')
    list_predictions.append(cv_y)
    pickle.dump(list_predictions, io.open('C:/data/crowdflower/predicts.txt', 'wb'))
    pickle.dump(list_predictions_test, io.open('C:/data/crowdflower/predicts_test.txt', 'wb'))