def load_data():
    # Load MNIST and flatten each 28x28 image into a 784-dimensional vector.
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(x_train.shape[0], 28 * 28)
    x_test = x_test.reshape(x_test.shape[0], 28 * 28)
    # LabelEncoder expects a 1-D array; fit it once on the training labels
    # and reuse it on the test labels so the two encodings stay consistent.
    encoder = LabelEncoder().fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)
    return (x_train, y_train), (x_test, y_test)
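A quick usage check; assumes the imports the snippet relies on (from tensorflow.keras.datasets import mnist, from sklearn.preprocessing import LabelEncoder):

(x_train, y_train), (x_test, y_test) = load_data()
print(x_train.shape, y_train.shape)  # (60000, 784) (60000,)
print(x_test.shape, y_test.shape)    # (10000, 784) (10000,)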
def selection_category(category_f):
    # Score each categorical feature against 'RainTomorrow' using the
    # chi-squared test and mutual information (sklearn.feature_selection).
    result = []
    for col in category_f:
        x = remove_all_nan[~remove_all_nan[col].isna()]
        feature = LabelEncoder().fit_transform(x[col])
        label = x['RainTomorrow']
        chi2_stat, pval = chi2(feature.reshape(-1, 1), label)
        mi = mutual_info_classif(feature.reshape(-1, 1), label)
        result.append([col, round(chi2_stat[0], 5), round(pval[0], 5), round(mi[0], 5)])
    return pd.DataFrame(result, columns=['Category_f', 'Chi2', 'Pval', 'MI'])
Example #3
def One_hot(data):
    np.set_printoptions(threshold=1e6)  # print all array elements instead of truncating
    # Integer-encode the labels; fit_transform is equivalent to fit followed by transform.
    le_sex = LabelEncoder().fit(data)
    Sex_label = le_sex.transform(data)
    # One-hot encode the integer labels; OneHotEncoder expects a 2-D array.
    Sex_ohe = OneHotEncoder(sparse=False).fit_transform(Sex_label.reshape(-1, 1))

    return Sex_ohe
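A minimal usage sketch with hypothetical data (assumes import numpy as np and from sklearn.preprocessing import LabelEncoder, OneHotEncoder; note that scikit-learn 1.2 renamed the sparse= argument to sparse_output=):

sex = ['male', 'female', 'female', 'male']
print(One_hot(sex))
# [[0. 1.]
#  [1. 0.]
#  [1. 0.]
#  [0. 1.]]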
Example #4
def plot_decision_function(X, y, clf, ax=None):
    """Plot the boundary of the decision function of a classifier."""
    from sklearn.preprocessing import LabelEncoder

    clf.fit(X, y)

    # create a grid to evaluate all possible samples
    plot_step = 0.02
    feature_0_min, feature_0_max = (X.iloc[:, 0].min() - 1,
                                    X.iloc[:, 0].max() + 1)
    feature_1_min, feature_1_max = (X.iloc[:, 1].min() - 1,
                                    X.iloc[:, 1].max() + 1)
    xx, yy = np.meshgrid(np.arange(feature_0_min, feature_0_max, plot_step),
                         np.arange(feature_1_min, feature_1_max, plot_step))

    # compute the associated prediction
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4)
    sns.scatterplot(
        data=pd.concat([X, y], axis=1),
        x=X.columns[0],
        y=X.columns[1],
        hue=y.name,
        ax=ax,
    )
Example #5
def create_one_hot_encodings(df, col_name='Embarked', drop_original=False):
    '''
    Creates n new binary columns, where n is the number of classes.
    Boolean (two-class) values create only one new 0/1 column.
    Works not only with strings but also with ints (Pclass=1/2/3 -> 3 new binary columns).
    '''
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    list_array_series = df[col_name]
    #print(list_array_series)
    int_encoded = LabelEncoder().fit_transform(list_array_series)
    #print(int_encoded)
    onehot_encoded = OneHotEncoder(sparse=False,
                                   categories='auto').fit_transform(
                                       int_encoded.reshape(
                                           len(int_encoded), 1))
    classes_count = onehot_encoded.shape[1]
    if classes_count > 2:
        for class_num in range(classes_count):
            df[col_name + '_' +
               str(class_num)] = onehot_encoded[:, class_num].astype(int)
    else:
        df[col_name + '_' + str(0)] = onehot_encoded[:, 1].astype(int)
    if drop_original:
        df.drop([col_name], axis=1, level=None, inplace=True, errors='raise')
    return
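A small usage sketch with hypothetical data (assumes import pandas as pd); the function mutates df in place and returns nothing:

df = pd.DataFrame({'Embarked': ['S', 'C', 'Q', 'S']})
create_one_hot_encodings(df, col_name='Embarked', drop_original=True)
print(df.columns.tolist())  # ['Embarked_0', 'Embarked_1', 'Embarked_2']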
Example #6
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load csv file as a dataframe using pandas
        df = read_csv(path, header=None)
        # store inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        print("Input data shape:", np.shape(self.X))
        print("Input label shape:", np.shape(self.y))
        # Ensure input X values are floats
        self.X = self.X.astype('float32')
        # Encode target labels and ensure they are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape(len(self.y), 1)
        #print("After reshaping, input label shape:", np.shape(self.y))
        #print("Unique labels:", np.unique(self.y))

    # Number of rows in dataset
    def __len__(self):
        return len(self.X)

    # Get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # Get indices for train and test rows
    def get_splits(self, n_train=0.7):
        train_split = round(n_train * len(self.X))
        test_split = len(self.X) - train_split
        return random_split(self, [train_split, test_split])
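A usage sketch, assuming the imports the snippet relies on (from torch.utils.data import Dataset, DataLoader, random_split; from pandas import read_csv; import numpy as np) and a hypothetical CSV file of features plus a final label column:

dataset = CSVDataset('iris.csv')  # hypothetical path
train, test = dataset.get_splits(n_train=0.7)
train_dl = DataLoader(train, batch_size=32, shuffle=True)
inputs, targets = next(iter(train_dl))
print(inputs.shape, targets.shape)  # e.g. torch.Size([32, 4]) torch.Size([32, 1])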
def plot_decision_function(fitted_classifier, range_features, ax=None):
    """Plot the boundary of the decision function of a classifier."""
    from sklearn.preprocessing import LabelEncoder

    feature_names = list(range_features.keys())
    # create a grid to evaluate all possible samples
    plot_step = 0.02
    xx, yy = np.meshgrid(
        np.arange(*range_features[feature_names[0]], plot_step),
        np.arange(*range_features[feature_names[1]], plot_step),
    )

    # compute the associated prediction
    Z = fitted_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4, cmap="RdBu")
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])

    return ax
class CSVDataset(Dataset):
    def __init__(self, path):

        df = pd.read_csv(path, header=None)

        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]

        self.X = self.X.astype('float32')

        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape(len(self.y), 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size

        return random_split(self, [train_size, test_size])
Example #9
    def _get_encoding(feature_name, all_feature_values, name_to_ind):
        """
        Helper method to generate the one-hot encoding for the categorical features.

        Parameters
        ----------
        all_feature_values : array-like
            values taken by the categorical feature, in sample order
        feature_name : str
            name of the feature being encoded
        name_to_ind: dict
            contains the mapping of the feature name to its position in the feature vector

        Returns
        -------
        [(feature index (str), feature name (str), encoding (dict)), (...), ... ]
        """
        encoded_features = []

        # create the one-hot encoding
        integer_encoded = LabelEncoder().fit_transform(all_feature_values)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        onehot_encoded = OneHotEncoder(
            sparse=False).fit_transform(integer_encoded)

        # add the one-hot encoding to the dict
        encoded_features.append((name_to_ind[feature_name], feature_name, {
            all_feature_values[i]: encoding
            for i, encoding in enumerate(onehot_encoded)
        }))

        return encoded_features
Example #10
def replace_nominal_column(col):
    """
    Returns a One Hot Encoded ndarray of col
    """
    labelledCol = LabelEncoder().fit_transform(col)
    labelledCol = labelledCol.reshape(labelledCol.shape[0], 1)
    return OneHotEncoder().fit_transform(labelledCol).toarray()
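For illustration, with hypothetical data (assumes from sklearn.preprocessing import LabelEncoder, OneHotEncoder):

col = ['red', 'green', 'blue', 'green']
print(replace_nominal_column(col))
# [[0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]
#  [0. 1. 0.]]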
Example #11
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        # label encode target and ensure the values are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
Example #12
class CSVDataset(Dataset):
    def __init__(self, path):
        # load the csv dataset as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        # ensure the inputs are floats
        self.X = self.X.astype('float32')
        # encode the targets
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row from the dataset
    def __getitem__(self, index):
        return [self.X[index], self.y[index]]

    # get indexes for test and train rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
Example #13
def load_dynamic_monks(encode_labels=True, include_waverers=False,
                       is_directed=True):
    module_path = dirname(__file__)

    n_time_steps = 3
    Y = np.empty((n_time_steps, 18, 18), dtype=np.float64)

    for t in range(n_time_steps):
        Y[t] = np.loadtxt(join(module_path, 'raw_data',
                               'sampson_{}.npy'.format(t)))
    # load groups
    file_name = ('sampson_groups_waverers.txt' if include_waverers else
                 'sampson_groups.txt')

    with open(join(module_path, 'raw_data', file_name)) as f:
        groups = np.array([l.rstrip('\n') for l in f.readlines()])

    if encode_labels:
        groups = LabelEncoder().fit_transform(groups)

    with open(join(module_path, 'raw_data', 'sampson_names.txt')) as f:
        names = np.array([l.rstrip('\n') for l in f.readlines()])

    if not is_directed:
        Y += Y.transpose((0, 2, 1))
        Y = (Y > 0).astype(np.float64)

    return Y, np.repeat(groups.reshape(1, -1), n_time_steps, axis=0), names
def plot_classification(model, X, y, ax=None):
    from sklearn.preprocessing import LabelEncoder
    model.fit(X, y)

    range_features = {
        feature_name: (X[feature_name].min() - 1, X[feature_name].max() + 1)
        for feature_name in X.columns
    }
    feature_names = list(range_features.keys())
    # create a grid to evaluate all possible samples
    plot_step = 0.02
    xx, yy = np.meshgrid(
        np.arange(*range_features[feature_names[0]], plot_step),
        np.arange(*range_features[feature_names[1]], plot_step),
    )

    # compute the associated prediction
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = LabelEncoder().fit_transform(Z)
    Z = Z.reshape(xx.shape)

    # make the plot of the boundary and the data samples
    if ax is None:
        _, ax = plt.subplots()
    ax.contourf(xx, yy, Z, alpha=0.4, cmap="RdBu")
    sns.scatterplot(x=X.columns[0],
                    y=X.columns[1],
                    hue=y.name,
                    data=pd.concat([X, y], axis=1),
                    ax=ax,
                    palette=["tab:red", "tab:blue", "black"])

    return ax
Example #15
def knn_purity(adata, label_key, n_neighbors=30):
    """Computes KNN Purity metric for ``adata`` given the batch column name.

        Parameters
        ----------
        adata: :class:`~anndata.AnnData`
            Annotated dataset.
        label_key: str
            Name of the column which contains information about different studies in ``adata.obs`` data frame.
        n_neighbors: int
            Number of nearest neighbors.
        Returns
        -------
        score: float
            KNN purity score. A float between 0 and 1.

    """
    adata = remove_sparsity(adata)
    labels = LabelEncoder().fit_transform(adata.obs[label_key].to_numpy())

    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    indices = nbrs.kneighbors(adata.X, return_distance=False)[:, 1:]
    neighbors_labels = np.vectorize(lambda i: labels[i])(indices)

    # per-cell purity scores
    scores = ((neighbors_labels - labels.reshape(-1, 1)) == 0).mean(axis=1)
    res = [np.mean(scores[labels == i])
           for i in np.unique(labels)]  # per cell-type purity

    return np.mean(res)
Example #16
def create_y():
    excel_file = r'C:\Users\jesse\OneDrive\Desktop\Research\PD\decline_label.xlsx'
    excel_read = pd.read_excel(excel_file)
    excel_array = np.array(excel_read['Label'])
    label = LabelEncoder().fit_transform(excel_array)
    label = label.reshape(len(label), 1)
    onehot = OneHotEncoder(sparse=False).fit_transform(label)
    return onehot
Example #17
    def get_label(self):
        res = 'bbbccefecaaacddd'
        labels = list(res)  # one label per character
        # https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
        labels = LabelEncoder().fit_transform(labels)
        labels = labels.reshape(len(labels), 1)
        res = OneHotEncoder(sparse=False).fit_transform(labels)
        return res
Example #18
def sample_data(X, y):
    if sourceType == SourceType.age:
        y = LabelEncoder().fit_transform(
            pd.cut(y, bins, labels=range(len(bins) - 1)))
    if sample_type == SampleType.under:
        X, y = under_sample(X, y)
    elif sample_type == SampleType.over:
        X, y = over_sample(X, y)
    else:
        if sourceType != SourceType.age:
            y = y.reshape(-1, 1)
    return X, y
Example #19
def one_hot(y):
    y_list = list(np.squeeze(y))
    y_dlist = list(set(y_list))  # deduplicate
    y_dlist.sort(key=y_list.index)  # keep first-appearance order
    y_d = LabelEncoder().fit_transform(y_dlist)
    y_onehot = OneHotEncoder(sparse=False).fit_transform(y_d.reshape(-1, 1))  # one-hot conversion
    dic = {}
    for i in range(len(y_dlist)):
        key = y_dlist[i]
        value = y_onehot[i]
        dic[key] = value
    return y_onehot, dic  # one-hot rows (one per unique value) and a dict mapping each value to its row
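A usage sketch with hypothetical data; note the returned matrix has one row per unique value, and the dict maps each value to its row:

y = np.array(['cat', 'dog', 'cat', 'fish'])
table, dic = one_hot(y)
print(dic['dog'])  # [0. 1. 0.]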
Example #20
    def getAnova(self, X, y):

        # y = y[:200]
        # X = X[:200]
        X = LabelEncoder().fit_transform(X.ravel()).reshape(*X.shape)
        # transform to binary
        # X = OneHotEncoder().fit_transform(X_int).toarray()

        n_samples = len(y)
        X = X.reshape((n_samples, -1))
        # add 200 non-informative features
        X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

        transform = feature_selection.SelectPercentile(
            feature_selection.f_classif)

        clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])

        # #############################################################################
        # Plot the cross-validation score as a function of percentile of features
        score_means = list()
        score_stds = list()
        percentiles = (5, 10, 20, 40, 60, 80, 100)

        for percentile in percentiles:
            clf.set_params(anova__percentile=percentile)
            # Compute cross-validation score using 1 CPU
            this_scores = cross_val_score(clf,
                                          X,
                                          y,
                                          n_jobs=1,
                                          verbose=10,
                                          cv=3)
            score_means.append(this_scores.mean())
            score_stds.append(this_scores.std())

        plt.errorbar(percentiles, score_means, np.array(score_stds))

        plt.title(
            'Performance of the SVM-Anova varying the percentile of features selected'
        )
        plt.xlabel('Percentile')
        plt.ylabel('Prediction rate')

        plt.axis('tight')
        plt.show()
Example #21
def split_train_test(dataSet):
    cut = -1 if sourceType == SourceType.race else -2
    if sourceType == SourceType.age:
        X, y = dataSet[:, :cut], dataSet[:, cut + 1]
    else:
        X, y = dataSet[:, :cut], dataSet[:, cut]
    y = y.astype('int')
    if sourceType == SourceType.age:
        y = LabelEncoder().fit_transform(pd.cut(y, bins, labels=range(len(bins) - 1)))
    if sample_type == SampleType.under:
        X, y = under_sample(X, y)
    elif sample_type == SampleType.over:
        X, y = over_sample(X, y)
    elif sourceType != SourceType.age:
        y = y.reshape(-1, 1)
    return X, y
Example #22
class CSVDataset(Dataset):
    def __init__(self, path):
        df = read_csv(path, header=None)  # load the csv file as a dataframe
        self.X = df.values[:, :-1]  # store the inputs
        self.y = df.values[:, -1]  # and outputs
        self.X = self.X.astype('float32')  # ensure input data is floats
        self.y = LabelEncoder().fit_transform(self.y)  # label encode the target
        self.y = self.y.astype('float32')  # ensure floats
        self.y = self.y.reshape((len(self.y), 1))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, (train_size, test_size))  # originally list
Example #23
def my_func(x_mtx):
    arrs_to_conc = []
    for i in range(x_mtx.shape[1]):  # range, not Python 2's xrange
        arr = numpy.unique(x_mtx[:, i])

        if len(arr) < 40000:
            digitized_arr = LabelEncoder().fit_transform(x_mtx[:, i])
            if isinstance(arr[0], float) and math.isnan(arr[0]):
                nan_idx = digitized_arr == 0
                digitized_arr[nan_idx] = len(arr) * 2
            coded_arr = sparse.lil_matrix(
                OneHotEncoder(sparse=True,
                              handle_unknown='ignore').fit_transform(
                                  digitized_arr.reshape(-1, 1)))

            arrs_to_conc.append(sparse.csr_matrix(coded_arr, dtype=float))
            #print i,coded_arr.shape

        else:
            arrs_to_conc.append(
                sparse.csr_matrix(x_mtx[:, i].reshape(-1, 1), dtype=float))

    return sparse.hstack(arrs_to_conc)
#
# Features to include:
#    academics
#    expenses
#    no-of-students
#    percent-admittance
#    percent-enrolled
#    percent-financial-aid
#    sat
#



###### final feature array
# I use one hot encoder -- a collection of dummy variables for state
# (assumption: LEState is the label-encoded state column, e.g.
#  LEState = LabelEncoder().fit_transform(univDataDF['state']))
XCat = OneHotEncoder().fit_transform(LEState.reshape(-1,1)).toarray()
# continuous features
contMatrix = univDataDF[['academics',
                         'expenses',
                         'no-of-students',
                         'percent-admittance',
                         'percent-enrolled',
                         'percent-financial-aid',
                         'sat']]
XCont = np.array(contMatrix)
X = np.hstack([XCont, XCat])




###### SVM classifier
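The snippet ends at this header; below is only a minimal sketch of the SVM fit it presumably led into, assuming a label vector y aligned with X (the kernel and C value are illustrative, not from the source):

from sklearn import svm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
clf = svm.SVC(kernel='rbf', C=1.0)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))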
Example #25
    # remove stopwords
    dFrame[y] = dFrame[y].apply(
        lambda x:
        [item for item in x if item not in stopwords.words('english')])
    # stemming
    dFrame[y] = dFrame[y].apply(
        lambda x: [nltk.stem.PorterStemmer().stem(y) for y in x])

# one hot vector
i = 3
listOneHot = []
for y in cols:
    for x in dFrame[y]:
        integer_encoded = LabelEncoder().fit_transform(x)
        onehot_encoder = OneHotEncoder(sparse=False)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        listOneHot.append(onehot_encoder.fit_transform(integer_encoded))
    dFrame.insert(i, y + 'OneHot', listOneHot, True)
    i += 1
    listOneHot = []

for y in cols:
    # joining with " "
    dFrame[y] = dFrame[y].str.join(" ")

corpus = list(dFrame['description'])
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
corpus_embeddings = embedder.encode(corpus)
# Query sentences:
queries = list(dFrame['tags'])
query_embeddings = embedder.encode(queries)
Example #26
    for i in range(data.shape[0]):
        c = single_autocorr(data, lag)
        corrs.append(c)
    corr = np.array(corrs)
    corr = corr.reshape(-1, 1)
    corr = np.expand_dims(corr, -1)
    corr = np.repeat(corr, series_length, axis=1)
    return corr


datetime.datetime.strptime(train.columns.values[0], '%Y-%m-%d').strftime('%a')
weekdays = [datetime.datetime.strptime(date, '%Y-%m-%d').strftime('%a')
            for date in train.columns.values[:-4]]

day_one_hot = LabelEncoder().fit_transform(weekdays)
day_one_hot = day_one_hot.reshape(-1, 1)
day_one_hot = OneHotEncoder(sparse=False).fit_transform(day_one_hot)
day_one_hot = np.expand_dims(day_one_hot, 0)

agent_int = LabelEncoder().fit(train['Agent'])
agent_enc = agent_int.transform(train['Agent'])
agent_enc = agent_enc.reshape(-1, 1)
agent_one_hot = OneHotEncoder(sparse=False).fit(agent_enc)

del agent_enc

page_int = LabelEncoder().fit(train['Sub_Page'])
page_enc = page_int.transform(train['Sub_Page'])
page_enc = page_enc.reshape(-1, 1)
page_one_hot = OneHotEncoder(sparse=False).fit(page_enc)
Example #27
def pre_processing(file_prefix='training'):
    loaded_data = pd.read_csv(file_prefix + '_data.csv')
    no_items = len(loaded_data)
    print("length of loaded data ", len(loaded_data))
    # To make encoding uniform we will put test data as well. Then calculate
    # one hot encoding
    if file_prefix == 'trial' or file_prefix == 'test':
        extended = pd.read_csv('training_data.csv')
        loaded_data = pd.concat([loaded_data, extended], axis=0)
        loaded_data.set_index(pd.Index(range(len(loaded_data))), inplace=True)

    # print(loaded_data.columns, extended.columns)

    # a = lambda x: ' '.join(x['word'].to_list())
    # sentence_data = loaded_data.groupby('sentence_id').apply(a)
    # for i in sentence_data.to_list():
    #     print(i)

    val = loaded_data['word'].apply(postagger)
    # print (np.ravel(val))
    otag = [i[0] for i in np.ravel(val)]
    utag = [i[1] for i in np.ravel(val)]
    # print(otag, utag)
    loaded_data['otag'] = otag
    loaded_data['utag'] = utag
    # change the categorical utag to a one-hot encoded value
    encoded = LabelEncoder().fit_transform(loaded_data['utag'])
    encoded = encoded.reshape(-1, 1)
    encoded_vector = OneHotEncoder(sparse=False).fit_transform(encoded)
    loaded_data = pd.concat([
        loaded_data,
        pd.DataFrame(encoded_vector,
                     columns=['utag_e_' + str(i) for i in range(12)])
    ],
                            axis=1)
    # Need to remove the hard coding from
    # the code
    print("Length of the loaded_data ", len(loaded_data))
    loaded_data['toklen'] = loaded_data['word'].apply(len)
    # loaded_data['crossreftime'] = loaded_data['TRT'] - loaded_data['FFD']
    # loaded_data['GPT-FFD'] = loaded_data['GPT'] - loaded_data['FFD']
    # loaded_data['TRT-GPT'] = loaded_data['TRT'] - loaded_data['GPT']
    if file_prefix == 'trial' or file_prefix == 'test':
        loaded_data = loaded_data[:no_items]
    print("Length of the loaded_data ", len(loaded_data))
    all_embedding = np.array([])
    temp = np.array([])
    for idx, word in enumerate(tqdm.tqdm(loaded_data['word'].to_list())):
        # print("Processing word no ", str(idx))
        if idx % 1000 == 0:
            print("Appending the big array")
            all_embedding = np.append(all_embedding, temp)
            temp = np.array([])
        word = word.translate(str.maketrans('', '', punct))
        word = re.sub(eos_pattern, '', word)
        if word != '':
            val = BERTembed(word)
            # all_embedding.append(val)
        else:
            # print("Processing word no/ Missing index ", str(idx))
            val = np.zeros(768)
        temp = np.append(temp, val)
    all_embedding = np.append(all_embedding, temp)
    an = loaded_data[:]
    d = pd.DataFrame(np.reshape(all_embedding, (-1, 768)))
    an = pd.concat([an, d], axis=1)
    # used for the visualization purpose
    an.to_csv(file_prefix + '_pos_tagged.csv')
    loaded_data = pd.read_csv(file_prefix + '_pos_tagged.csv')
    val = loaded_data['word'].apply(wordnet_)
    loaded_data['pps'] = val
    x = lambda tr: len(pronouncing.phones_for_word(tr)[0].split(' ')) if len(
        pronouncing.phones_for_word(tr)) > 0 else 0
    val = loaded_data['word'].apply(x)
    loaded_data['phonem'] = val
    loaded_data.to_csv(file_prefix + '_pos_tagged.csv', index=False)
Example #28
        logits = gcn1(g1)
        val_loss = criterion(logits[g1.val_mask], g1.y[g1.val_mask])
        pred_val = np.argmax(logits[g1.val_mask].cpu().numpy(), axis=1)
        pred_train = np.argmax(logits[g1.train_mask].cpu().numpy(), axis=1)
        acc_val = accuracy_score(g1.y.cpu()[g1.val_mask], pred_val)
        acc_train = accuracy_score(g1.y.cpu()[g1.train_mask], pred_train)
        print(
            f"[{epoch + 1:{length}}] loss: {loss.item(): .3f}, "
            f"training accuracy: {acc_train: .3f}, val_accuracy: {acc_val: .3f}"
        )

with th.no_grad():
    hierarchy_true1 = th.nn.functional.softmax(
        gcn1(g1)[g1.n_vocab:]).cpu().numpy()

hierarchy1 = OneHotEncoder(sparse=False).fit_transform(y_top1.reshape(-1, 1))
print(f"shape of hierarchy: {hierarchy1.shape}")
print(f"shape of hierarchy_true: {hierarchy_true1.shape}")

del gcn1
del g1

g2 = t2g.fit_transform(x,
                       y_top2,
                       test_idx=test_idx,
                       val_idx=val_idx,
                       hierarchy_feats=hierarchy1)
gcn2 = model(g2.x.shape[1],
             len(np.unique(y_top2)),
             n_hidden_gcn=n_hidden,
             dropout=dropout)
Example #29
print(np.log1p(2))

testdata = pd.DataFrame({
    'pet': ['cat', 'dog', 'dog', 'fish'],
    'age': [4, 6, 3, 3],
    'salary': [4, 1, 1, 1]
})
a1 = OneHotEncoder(sparse=False).fit_transform(testdata[['age']])
a2 = OneHotEncoder(sparse=False).fit_transform(testdata[['salary']])
print("----------------------------+++")
print(testdata)
final_output = np.hstack((a1, a2))
print("----------------------------+++")
print(final_output)

print("----------------------------+++")
a = LabelEncoder().fit_transform(testdata['pet'])
print(a)
print(a.reshape(-1, 1).shape)
OneHotEncoder(sparse=False).fit_transform(a.reshape(
    -1, 1))  # note: reshape turns a into a 2-D array here

# Method 2: use LabelBinarizer() directly

a3 = LabelBinarizer().fit_transform(testdata['pet'])
print(a3)

a4 = pd.get_dummies(testdata, columns=testdata.columns)

print(a4)
            data_dict['y'].extend(pickle_data['y'])
            data_dict['track_paths'].extend(pickle_data['track_paths'])

    data_dict['X'] = np.array(data_dict['X'])
    data_dict['y'] = np.array(data_dict['y'])
    with open(PICKLE_DIR + 'finalé.pkl', 'wb') as final_pickle:
        pickle.dump(data_dict, final_pickle)

if __name__ == "__main__":
    genres_data = pd.read_csv(METADATA_DIR + "genres.csv", index_col = 0)
    tracks = cleanTracksData(METADATA_DIR + "tracks2.csv")
    genresDict = {}

    # One hot encoding genre list, which is our output
    labelEncoded = LabelEncoder().fit_transform(GENRES)
    labelEncoded = labelEncoded.reshape(len(labelEncoded), 1)
    oneHotEncoder = OneHotEncoder(sparse=False)
    oneHotEncoded = oneHotEncoder.fit_transform(labelEncoded)

    for i, genre in enumerate(GENRES):
        genresDict[genre] = np.array(oneHotEncoded[i])

    trackIDs = getTrackIDs(AUDIO_DIR, tracks)
    # text file made to do some quality control test on excel
    np.savetxt(MAIN_DIR + "trackIDs.csv", trackIDs, delimiter=",", fmt='%s')

    if not os.path.exists(PICKLE_DIR):
        try:
            os.makedirs(PICKLE_DIR)
        except OSError:
            pass