def load_data():
    # Load MNIST and flatten each 28x28 image into a 784-dim vector.
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(x_train.shape[0], 28 * 28)
    x_test = x_test.reshape(x_test.shape[0], 28 * 28)
    # LabelEncoder expects a 1-D array; fit on the training labels and reuse
    # the same encoder for the test labels so both use the same mapping.
    encoder = LabelEncoder().fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)
    return (x_train, y_train), (x_test, y_test)
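# Usage sketch for load_data() (hypothetical driver, assuming the imports the
# function relies on, e.g. `from tensorflow.keras.datasets import mnist` and
# `from sklearn.preprocessing import LabelEncoder`):
(x_train, y_train), (x_test, y_test) = load_data()
print(x_train.shape)  # (60000, 784)
print(y_train[:5])    # integer class labels in [0, 9]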
def selection_category(category_f):
    # For each categorical feature, drop rows where it is missing,
    # label-encode it, and score it against the target with a chi-squared
    # test and mutual information.
    result = []
    for name in category_f:
        x = remove_all_nan[~remove_all_nan[name].isna()]
        feature = LabelEncoder().fit_transform(x[name])
        label = x['RainTomorrow']
        fstat, pval = chi2(feature.reshape(-1, 1), label)
        mi = mutual_info_classif(feature.reshape(-1, 1), label)
        result.append([name, round(fstat[0], 5), round(pval[0], 5),
                       round(mi[0], 5)])
    return pd.DataFrame(result, columns=['Category_f', 'Chi2', 'Pval', 'MI'])
def One_hot(data):
    np.set_printoptions(threshold=1e6)  # make sure all elements are printed
    # fit followed by transform...
    le_sex = LabelEncoder().fit(data)
    Sex_label = le_sex.transform(data)
    # ...is equivalent to fit_transform in a single call
    Sex_label = LabelEncoder().fit_transform(data)
    ohe_sex = OneHotEncoder(sparse=False).fit(Sex_label.reshape(-1, 1))
    Sex_ohe = ohe_sex.transform(Sex_label.reshape(-1, 1))
    Sex_ohe_3 = OneHotEncoder(sparse=False).fit_transform(
        Sex_label.reshape((-1, 1)))
    return Sex_ohe_3
def plot_decision_function(X, y, clf, ax=None):
    """Plot the boundary of the decision function of a classifier."""
    from sklearn.preprocessing import LabelEncoder

    clf.fit(X, y)

    # create a grid to evaluate all possible samples
    plot_step = 0.02
    feature_0_min, feature_0_max = (X.iloc[:, 0].min() - 1,
                                    X.iloc[:, 0].max() + 1)
    feature_1_min, feature_1_max = (X.iloc[:, 1].min() - 1,
                                    X.iloc[:, 1].max() + 1)
    xx, yy = np.meshgrid(
        np.arange(feature_0_min, feature_0_max, plot_step),
        np.arange(feature_1_min, feature_1_max, plot_step),
    )

    # compute the associated prediction
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4)
    sns.scatterplot(
        data=pd.concat([X, y], axis=1),
        x=X.columns[0],
        y=X.columns[1],
        hue=y.name,
        ax=ax,
    )
def create_one_hot_encodings(df, col_name='Embarked', drop_original=False):
    '''
    Adds one-hot columns for ``col_name`` to ``df`` in place.

    For a column with n > 2 classes, creates n new binary columns;
    for a binary column, creates a single new 1/0 column.
    Works with string columns as well as integer ones
    (e.g. Pclass = 1/2/3 -> 3 new binary columns).
    '''
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder

    list_array_series = df[col_name]
    #print(list_array_series)
    int_encoded = LabelEncoder().fit_transform(list_array_series)
    #print(int_encoded)
    onehot_encoded = OneHotEncoder(sparse=False, categories='auto').fit_transform(
        int_encoded.reshape(len(int_encoded), 1))

    classes_count = onehot_encoded.shape[1]
    if classes_count > 2:
        for class_num in range(classes_count):
            df[col_name + '_' + str(class_num)] = \
                onehot_encoded[:, class_num].astype(int)
    else:
        df[col_name + '_' + str(0)] = onehot_encoded[:, 1].astype(int)

    if drop_original:
        df.drop([col_name], axis=1, inplace=True, errors='raise')
    return
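# Usage sketch for create_one_hot_encodings on a toy frame (the column values
# below are illustrative, not from the original data):
import pandas as pd

toy = pd.DataFrame({'Embarked': ['S', 'C', 'Q', 'S'],
                    'Fare': [7.25, 71.28, 8.46, 53.1]})
create_one_hot_encodings(toy, col_name='Embarked', drop_original=True)
print(toy.columns.tolist())  # ['Fare', 'Embarked_0', 'Embarked_1', 'Embarked_2']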
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load csv file as a dataframe using pandas
        df = read_csv(path, header=None)
        # store inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        print("Input data shape:", np.shape(self.X))
        print("Input label shape:", np.shape(self.y))
        # Ensure input X values are floats
        self.X = self.X.astype('float32')
        # Encode target labels and ensure they are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape(len(self.y), 1)
        #print("After reshaping, input label shape:", np.shape(self.y))
        #print("Unique labels:", np.unique(self.y))

    # Number of rows in dataset
    def __len__(self):
        return len(self.X)

    # Get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # Get indices for train and test rows
    def get_splits(self, n_train=0.7):
        train_split = round(n_train * len(self.X))
        test_split = len(self.X) - train_split
        return random_split(self, [train_split, test_split])
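# Usage sketch for CSVDataset, assuming the imports used above
# (`from pandas import read_csv`, `from torch.utils.data import Dataset,
# DataLoader, random_split`); 'ionosphere.csv' is a placeholder path:
dataset = CSVDataset('ionosphere.csv')
train, test = dataset.get_splits(n_train=0.7)
train_dl = DataLoader(train, batch_size=32, shuffle=True)
for batch_X, batch_y in train_dl:
    print(batch_X.shape, batch_y.shape)
    break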
def plot_decision_function(fitted_classifier, range_features, ax=None):
    """Plot the boundary of the decision function of a classifier."""
    from sklearn.preprocessing import LabelEncoder

    feature_names = list(range_features.keys())

    # create a grid to evaluate all possible samples
    plot_step = 0.02
    xx, yy = np.meshgrid(
        np.arange(*range_features[feature_names[0]], plot_step),
        np.arange(*range_features[feature_names[1]], plot_step),
    )

    # compute the associated prediction
    Z = fitted_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4, cmap="RdBu")
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])

    return ax
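# Usage sketch for plot_decision_function; `X` and `y` (two-feature training
# data) are assumed to exist, and the feature names and ranges below are
# illustrative:
from sklearn.tree import DecisionTreeClassifier

range_features = {
    'Culmen Length (mm)': (30.0, 62.0),
    'Culmen Depth (mm)': (13.0, 22.0),
}
fitted_classifier = DecisionTreeClassifier().fit(X, y)
ax = plot_decision_function(fitted_classifier, range_features)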
class CSVDataset(Dataset):
    def __init__(self, path):
        df = pd.read_csv(path, header=None)
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        self.X = self.X.astype('float32')
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape(len(self.y), 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, [train_size, test_size])
def _get_encoding(feature_name, all_feature_values, name_to_ind):
    """
    Helper method to generate the one-hot encoding for the categorical
    features.

    Parameters
    ----------
    feature_name
    all_feature_values
    name_to_ind: dict
        contains the mapping of the feature name to its position in the
        feature vector

    Returns
    -------
    [(feature index (str), feature name (str), encoding (dict)), (...), ... ]
    """
    encoded_features = []

    # create the one-hot encoding
    integer_encoded = LabelEncoder().fit_transform(all_feature_values)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = OneHotEncoder(sparse=False).fit_transform(integer_encoded)

    # add the one-hot encoding to the dict
    encoded_features.append((name_to_ind[feature_name], feature_name, {
        all_feature_values[i]: encoding
        for i, encoding in enumerate(onehot_encoded)
    }))

    return encoded_features
def replace_nominal_column(col):
    """ Returns a One Hot Encoded ndarray of col """
    labelledCol = LabelEncoder().fit_transform(col)
    labelledCol = labelledCol.reshape(labelledCol.shape[0], 1)
    return OneHotEncoder().fit_transform(labelledCol).toarray()
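# Usage sketch for replace_nominal_column (numpy import assumed); LabelEncoder
# assigns integers in sorted order, so 'blue' < 'green' < 'red' here:
colors = np.array(['red', 'green', 'blue', 'green'])
print(replace_nominal_column(colors))
# [[0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]
#  [0. 1. 0.]]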
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        # label encode target and ensure the values are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
class CSVDataset(Dataset):
    def __init__(self, path):
        # load the csv dataset as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # make them floats
        self.X = self.X.astype('float32')
        # encode the targets
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row from the dataset
    def __getitem__(self, index):
        return [self.X[index], self.y[index]]

    # get indexes for the test and train rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
def load_dynamic_monks(encode_labels=True, include_waverers=False,
                       is_directed=True):
    module_path = dirname(__file__)

    n_time_steps = 3
    Y = np.empty((n_time_steps, 18, 18), dtype=np.float64)
    for t in range(n_time_steps):
        Y[t] = np.loadtxt(
            join(module_path, 'raw_data', 'sampson_{}.npy'.format(t)))

    # load groups
    file_name = ('sampson_groups_waverers.txt' if include_waverers else
                 'sampson_groups.txt')
    with open(join(module_path, 'raw_data', file_name)) as f:
        groups = np.array([l.rstrip('\n') for l in f.readlines()])

    if encode_labels:
        groups = LabelEncoder().fit_transform(groups)

    with open(join(module_path, 'raw_data', 'sampson_names.txt')) as f:
        names = np.array([l.rstrip('\n') for l in f.readlines()])

    if not is_directed:
        Y += Y.transpose((0, 2, 1))
        Y = (Y > 0).astype(np.float64)

    return Y, np.repeat(groups.reshape(1, -1), n_time_steps, axis=0), names
def plot_classification(model, X, y, ax=None):
    from sklearn.preprocessing import LabelEncoder

    model.fit(X, y)

    range_features = {
        feature_name: (X[feature_name].min() - 1, X[feature_name].max() + 1)
        for feature_name in X.columns
    }
    feature_names = list(range_features.keys())

    # create a grid to evaluate all possible samples
    plot_step = 0.02
    xx, yy = np.meshgrid(
        np.arange(*range_features[feature_names[0]], plot_step),
        np.arange(*range_features[feature_names[1]], plot_step),
    )

    # compute the associated prediction
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4, cmap="RdBu")
    # plot the samples passed to this function (not module-level globals),
    # and draw on the axes created above
    sns.scatterplot(x=X.columns[0], y=X.columns[1], hue=y.name,
                    data=pd.concat([X, y], axis=1), ax=ax,
                    palette=["tab:red", "tab:blue", "black"])
    return ax
def knn_purity(adata, label_key, n_neighbors=30):
    """Computes KNN Purity metric for ``adata`` given the batch column name.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Name of the column which contains information about different studies
        in ``adata.obs`` data frame.
    n_neighbors: int
        Number of nearest neighbors.

    Returns
    -------
    score: float
        KNN purity score. A float between 0 and 1.
    """
    adata = remove_sparsity(adata)
    labels = LabelEncoder().fit_transform(adata.obs[label_key].to_numpy())

    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    indices = nbrs.kneighbors(adata.X, return_distance=False)[:, 1:]
    neighbors_labels = np.vectorize(lambda i: labels[i])(indices)

    # per-cell purity scores
    scores = ((neighbors_labels - labels.reshape(-1, 1)) == 0).mean(axis=1)
    # per cell-type purity
    res = [np.mean(scores[labels == i]) for i in np.unique(labels)]

    return np.mean(res)
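# Usage sketch for knn_purity on a small synthetic AnnData object
# (`remove_sparsity` is assumed to be importable alongside the function;
# the data and column name below are made up for illustration):
import anndata
import numpy as np

rng = np.random.default_rng(0)
adata = anndata.AnnData(X=rng.normal(size=(200, 10)))
adata.obs['study'] = rng.choice(['batch_a', 'batch_b'], size=200)
print(knn_purity(adata, label_key='study', n_neighbors=15))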
def create_y():
    excel_file = r'C:\Users\jesse\OneDrive\Desktop\Research\PD\decline_label.xlsx'
    excel_read = pd.read_excel(excel_file)
    excel_array = np.array(excel_read['Label'])
    label = LabelEncoder().fit_transform(excel_array)
    label = label.reshape(len(label), 1)
    onehot = OneHotEncoder(sparse=False).fit_transform(label)
    return onehot
def get_label(self):
    res = 'bbbccefecaaacddd'
    labels = []
    for ch in res:
        labels.append(ch)
    # https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
    labels = LabelEncoder().fit_transform(labels)
    labels = labels.reshape(len(labels), 1)
    res = OneHotEncoder(sparse=False).fit_transform(labels)
    return res
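# Sketch of what get_label returns for the hard-coded string above: the 16
# characters span 6 distinct classes ('a'..'f'), so the result is a (16, 6)
# one-hot array. (sparse=False matches the snippet; scikit-learn >= 1.2
# spells it sparse_output=False.)
labels = LabelEncoder().fit_transform(list('bbbccefecaaacddd'))
onehot = OneHotEncoder(sparse=False).fit_transform(labels.reshape(-1, 1))
print(onehot.shape)  # (16, 6)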
def sample_data(X, y):
    if sourceType == SourceType.age:
        # bin continuous ages, then label-encode the bin ids
        y = LabelEncoder().fit_transform(
            pd.cut(y, bins, labels=range(len(bins) - 1)))
    if sample_type == SampleType.under:
        X, y = under_sample(X, y)
    elif sample_type == SampleType.over:
        X, y = over_sample(X, y)
    else:
        if sourceType != SourceType.age:
            y = y.reshape(-1, 1)
    return X, y
def one_hot(y):
    y_list = list(np.squeeze(y))
    y_dlist = list(set(y_list))  # deduplicate
    y_dlist.sort(key=y_list.index)  # keep first-occurrence order
    y_d = LabelEncoder().fit_transform(y_dlist)
    y_onehot = OneHotEncoder(sparse=False).fit_transform(
        y_d.reshape(-1, 1))  # one-hot conversion
    dic = {}
    for i in range(len(y_dlist)):
        key = y_dlist[i]
        value = y_onehot[i]
        dic[key] = value
    # return the one-hot matrix and a dict mapping each original label to
    # its one-hot vector
    return y_onehot, dic
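# Usage sketch for one_hot: note it returns a codebook with one row per
# *distinct* label (in first-occurrence order) plus a label -> vector dict,
# not one row per sample:
y_onehot, dic = one_hot(np.array(['cat', 'dog', 'cat', 'fish']))
print(y_onehot.shape)  # (3, 3)
print(dic['dog'])      # [0. 1. 0.]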
def getAnova(self, X, y):
    # y = y[:200]
    # X = X[:200]
    X = LabelEncoder().fit_transform(X.ravel()).reshape(*X.shape)  # transform to binary
    # X = OneHotEncoder().fit_transform(X_int).toarray()
    n_samples = len(y)
    X = X.reshape((n_samples, -1))
    # add 200 non-informative features
    X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

    transform = feature_selection.SelectPercentile(feature_selection.f_classif)
    clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])

    # #########################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (5, 10, 20, 40, 60, 80, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        # Compute cross-validation score using 1 CPU
        this_scores = cross_val_score(clf, X, y, n_jobs=1, verbose=10, cv=3)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the SVM-Anova varying the percentile of features selected'
    )
    plt.xlabel('Percentile')
    plt.ylabel('Prediction rate')
    plt.axis('tight')
    plt.show()
def split_train_test(dataSet):
    cut = -1 if sourceType == SourceType.race else -2
    X, y = ((dataSet[:, :cut], dataSet[:, cut + 1])
            if sourceType == SourceType.age
            else (dataSet[:, :cut], dataSet[:, cut]))
    #if processType == ProcessType.name_c_tbn or processType == ProcessType.name or processType == ProcessType.tbn_c_name_att:
    #    idx = np.array([[num for num in range(len(X))]])
    #    X = np.concatenate((X, idx.T), axis=1)
    #X = X[:, 1:]
    y = y.astype('int')
    #y = np.array([int(val) for val in y])
    if sourceType == SourceType.age:
        y = LabelEncoder().fit_transform(
            pd.cut(y, bins, labels=range(len(bins) - 1)))
    if sample_type == SampleType.under:
        X, y = under_sample(X, y)
    elif sample_type == SampleType.over:
        X, y = over_sample(X, y)
    else:
        if sourceType != SourceType.age:
            y = y.reshape(-1, 1)
    return X, y
class CSVDataset(Dataset):
    def __init__(self, path):
        df = read_csv(path, header=None)               # load the csv file as a dataframe
        self.X = df.values[:, :-1]                     # store the inputs
        self.y = df.values[:, -1]                      # and outputs
        self.X = self.X.astype('float32')              # ensure input data is floats
        self.y = LabelEncoder().fit_transform(self.y)  # label encode target
        self.y = self.y.astype('float32')              # ensure floats
        self.y = self.y.reshape((len(self.y), 1))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, (train_size, test_size))  # originally a list
def my_func(x_mtx):
    arrs_to_conc = []
    for i in range(x_mtx.shape[1]):  # range, not the Python 2 xrange
        arr = numpy.unique(x_mtx[:, i])
        if len(arr) < 40000:
            digitized_arr = LabelEncoder().fit_transform(x_mtx[:, i])
            # NaN sorts first, so code 0 marks missing values; remap it
            # outside the valid code range
            if isinstance(arr[0], float) and math.isnan(arr[0]):
                nan_idx = digitized_arr == 0
                digitized_arr[nan_idx] = len(arr) * 2
            coded_arr = sparse.lil_matrix(
                OneHotEncoder(sparse=True, handle_unknown='ignore').fit_transform(
                    digitized_arr.reshape(-1, 1)))
            arrs_to_conc.append(sparse.csr_matrix(coded_arr, dtype=float))
            #print(i, coded_arr.shape)
        else:
            arrs_to_conc.append(
                sparse.csr_matrix(x_mtx[:, i].reshape(-1, 1), dtype=float))
    return sparse.hstack(arrs_to_conc)
# # Features to include:
#   academics
#   expenses
#   no-of-students
#   percent-admittance
#   percent-enrolled
#   percent-financial-aid
#   sat

# ###### final feature array

# I use one hot encoder -- a collection of dummy variables for state
XCat = OneHotEncoder().fit_transform(LEState.reshape(-1, 1)).toarray()

# continuous features
contMatrix = univDataDF[['academics', 'expenses', 'no-of-students',
                         'percent-admittance', 'percent-enrolled',
                         'percent-financial-aid', 'sat']]
XCont = np.array(contMatrix)

X = np.hstack([XCont, XCat])

###### SVM classifier
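# A minimal sketch of the SVM step announced by the heading above; `yUniv`
# (the target labels aligned with X) is a hypothetical name, and the kernel
# and C are illustrative defaults, not the author's settings:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, yUniv,
                                                    test_size=0.25,
                                                    random_state=0)
clf = SVC(kernel='rbf', C=1.0).fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))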
# remove stopwords
dFrame[y] = dFrame[y].apply(
    lambda x: [item for item in x if item not in stopwords.words('english')])

# stemming
dFrame[y] = dFrame[y].apply(
    lambda x: [nltk.stem.PorterStemmer().stem(w) for w in x])

# one hot vector
i = 3
listOneHot = []
for y in cols:
    for x in dFrame[y]:
        integer_encoded = LabelEncoder().fit_transform(x)
        onehot_encoder = OneHotEncoder(sparse=False)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        listOneHot.append(onehot_encoder.fit_transform(integer_encoded))
    dFrame.insert(i, y + 'OneHot', listOneHot, True)
    i += 1
    listOneHot = []

for y in cols:
    # joining with " "
    dFrame[y] = dFrame[y].str.join(" ")

corpus = list(dFrame['description'])
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
corpus_embeddings = embedder.encode(corpus)

# Query sentences:
queries = list(dFrame['tags'])
query_embeddings = embedder.encode(queries)
# (enclosing function reconstructed from the dangling `return corr`; the
# name `batch_autocorr` and the per-series `data[i]` indexing are assumed)
def batch_autocorr(data, lag, series_length):
    corrs = []
    for i in range(data.shape[0]):
        c = single_autocorr(data[i], lag)
        corrs.append(c)
    corr = np.array(corrs)
    corr = corr.reshape(-1, 1)
    corr = np.expand_dims(corr, -1)
    corr = np.repeat(corr, series_length, axis=1)
    return corr


datetime.datetime.strptime(train.columns.values[0], '%Y-%m-%d').strftime('%a')

weekdays = [datetime.datetime.strptime(date, '%Y-%m-%d').strftime('%a')
            for date in train.columns.values[:-4]]

day_one_hot = LabelEncoder().fit_transform(weekdays)
day_one_hot = day_one_hot.reshape(-1, 1)
day_one_hot = OneHotEncoder(sparse=False).fit_transform(day_one_hot)
day_one_hot = np.expand_dims(day_one_hot, 0)

agent_int = LabelEncoder().fit(train['Agent'])
agent_enc = agent_int.transform(train['Agent'])
agent_enc = agent_enc.reshape(-1, 1)
agent_one_hot = OneHotEncoder(sparse=False).fit(agent_enc)

del agent_enc

page_int = LabelEncoder().fit(train['Sub_Page'])
page_enc = page_int.transform(train['Sub_Page'])
page_enc = page_enc.reshape(-1, 1)
page_one_hot = OneHotEncoder(sparse=False).fit(page_enc)
def pre_processing(file_prefix='training'):
    loaded_data = pd.read_csv(file_prefix + '_data.csv')
    no_items = len(loaded_data)
    print("length of loaded data ", len(loaded_data))

    # To make the encoding uniform we append the training data as well, then
    # compute the one-hot encoding over the combined frame
    if file_prefix == 'trial' or file_prefix == 'test':
        extended = pd.read_csv('training_data.csv')
        loaded_data = pd.concat([loaded_data, extended], axis=0)
        loaded_data.set_index(pd.Index(range(len(loaded_data))), inplace=True)
        # print(loaded_data.columns, extended.columns)

    # a = lambda x: ' '.join(x['word'].to_list())
    # sentence_data = loaded_data.groupby('sentence_id').apply(a)
    # for i in sentence_data.to_list():
    #     print(i)

    val = loaded_data['word'].apply(postagger)
    # print(np.ravel(val))
    otag = [i[0] for i in np.ravel(val)]
    utag = [i[1] for i in np.ravel(val)]
    # print(otag, utag)
    loaded_data['otag'] = otag
    loaded_data['utag'] = utag

    # change the categorical utag to its one-hot encoded value
    encoded = LabelEncoder().fit_transform(loaded_data['utag'])
    encoded = encoded.reshape(-1, 1)
    encoded_vector = OneHotEncoder(sparse=False).fit_transform(encoded)
    # TODO: remove the hard-coded number of tag columns (12)
    loaded_data = pd.concat([
        loaded_data,
        pd.DataFrame(encoded_vector,
                     columns=['utag_e_' + str(i) for i in range(12)])
    ], axis=1)

    print("Length of the loaded_data ", len(loaded_data))
    loaded_data['toklen'] = loaded_data['word'].apply(len)
    # loaded_data['crossreftime'] = loaded_data['TRT'] - loaded_data['FFD']
    # loaded_data['GPT-FFD'] = loaded_data['GPT'] - loaded_data['FFD']
    # loaded_data['TRT-GPT'] = loaded_data['TRT'] - loaded_data['GPT']

    if file_prefix == 'trial' or file_prefix == 'test':
        loaded_data = loaded_data[:no_items]
    print("Length of the loaded_data ", len(loaded_data))

    all_embedding = np.array([])
    temp = np.array([])
    for idx, word in enumerate(tqdm.tqdm(loaded_data['word'].to_list())):
        # print("Processing word no ", str(idx))
        if idx % 1000 == 0:
            print("Appending the big array")
            all_embedding = np.append(all_embedding, temp)
            temp = np.array([])
        word = word.translate(str.maketrans('', '', punct))
        word = re.sub(eos_pattern, '', word)
        if word != '':
            val = BERTembed(word)
        else:
            # print("Processing word no/ Missing index ", str(idx))
            val = np.zeros(768)
        temp = np.append(temp, val)
    all_embedding = np.append(all_embedding, temp)

    an = loaded_data[:]
    d = pd.DataFrame(np.reshape(all_embedding, (-1, 768)))
    an = pd.concat([an, d], axis=1)
    # used for visualization purposes
    an.to_csv(file_prefix + '_pos_tagged.csv')

    loaded_data = pd.read_csv(file_prefix + '_pos_tagged.csv')
    val = loaded_data['word'].apply(wordnet_)
    loaded_data['pps'] = val

    # number of phonemes per word (0 when the word is not in the dictionary)
    phoneme_count = lambda tr: (len(pronouncing.phones_for_word(tr)[0].split(' '))
                                if len(pronouncing.phones_for_word(tr)) > 0 else 0)
    loaded_data['phonem'] = loaded_data['word'].apply(phoneme_count)

    loaded_data.to_csv(file_prefix + '_pos_tagged.csv', index=False)
logits = gcn1(g1)
val_loss = criterion(logits[g1.val_mask], g1.y[g1.val_mask])
pred_val = np.argmax(logits[g1.val_mask].cpu().numpy(), axis=1)
pred_train = np.argmax(logits[g1.train_mask].cpu().numpy(), axis=1)
acc_val = accuracy_score(g1.y.cpu()[g1.val_mask], pred_val)
acc_train = accuracy_score(g1.y.cpu()[g1.train_mask], pred_train)
print(
    f"[{epoch + 1:{length}}] loss: {loss.item(): .3f}, "
    f"training accuracy: {acc_train: .3f}, val_accuracy: {acc_val: .3f}"
)

with th.no_grad():
    hierarchy_true1 = th.nn.functional.softmax(
        gcn1(g1)[g1.n_vocab:]).cpu().numpy()

hierarchy1 = OneHotEncoder(sparse=False).fit_transform(y_top1.reshape(-1, 1))
print(f"shape of hierarchy: {hierarchy1.shape}")
print(f"shape of hierarchy_true: {hierarchy_true1.shape}")

del gcn1
del g1

g2 = t2g.fit_transform(x, y_top2, test_idx=test_idx, val_idx=val_idx,
                       hierarchy_feats=hierarchy1)
gcn2 = model(g2.x.shape[1], len(np.unique(y_top2)),
             n_hidden_gcn=n_hidden, dropout=dropout)
print(np.log1p(2))

testdata = pd.DataFrame({
    'pet': ['cat', 'dog', 'dog', 'fish'],
    'age': [4, 6, 3, 3],
    'salary': [4, 1, 1, 1]
})

a1 = OneHotEncoder(sparse=False).fit_transform(testdata[['age']])
a2 = OneHotEncoder(sparse=False).fit_transform(testdata[['salary']])
print("----------------------------+++")
print(testdata)
final_output = np.hstack((a1, a2))
print("----------------------------+++")
print(final_output)
print("----------------------------+++")

a = LabelEncoder().fit_transform(testdata['pet'])
print(a)
print(a.reshape(-1, 1).shape)
# note: reshape converts a into a 2-D array before one-hot encoding
OneHotEncoder(sparse=False).fit_transform(a.reshape(-1, 1))

# Option 2: use LabelBinarizer() directly
a3 = LabelBinarizer().fit_transform(testdata['pet'])
print(a3)

a4 = pd.get_dummies(testdata, columns=testdata.columns)
print(a4)
    data_dict['y'].extend(pickle_data['y'])
    data_dict['track_paths'].extend(pickle_data['track_paths'])

    data_dict['X'] = np.array(data_dict['X'])
    data_dict['y'] = np.array(data_dict['y'])

    with open(PICKLE_DIR + 'finalé.pkl', 'wb') as final_pickle:
        pickle.dump(data_dict, final_pickle)


if __name__ == "__main__":
    genres_data = pd.read_csv(METADATA_DIR + "genres.csv", index_col=0)
    tracks = cleanTracksData(METADATA_DIR + "tracks2.csv")
    genresDict = {}

    # One hot encoding the genre list, which is our output
    labelEncoded = LabelEncoder().fit_transform(GENRES)
    labelEncoded = labelEncoded.reshape(len(labelEncoded), 1)
    oneHotEncoder = OneHotEncoder(sparse=False)
    oneHotEncoded = oneHotEncoder.fit_transform(labelEncoded)
    for i, genre in enumerate(GENRES):
        genresDict[genre] = np.array(oneHotEncoded[i])

    trackIDs = getTrackIDs(AUDIO_DIR, tracks)
    # text file made to run some quality-control tests in Excel
    np.savetxt(MAIN_DIR + "trackIDs.csv", trackIDs, delimiter=",", fmt='%s')

    if not os.path.exists(PICKLE_DIR):
        try:
            os.makedirs(PICKLE_DIR)
        except:
            pass