def load_data(self, dictionary, train_split=0.7):
        """
        Fetch the training data and split it per class into train and test parts.

        The data returned by ``two_neighbour_data.get_training_data`` is stored
        on ``self.data``. For every pinyin class, the rows whose one-hot label
        matches that class are selected; the first
        ``int(train_split * y_count[pinyin])`` of them go to the train set and
        the remainder to the test set, so every class is split at the same ratio.

        :param dictionary: forwarded unchanged to ``get_training_data``
        :param train_split: fraction of each class assigned to the train set

        Sets ``self.train_x``, ``self.train_y``, ``self.test_x``, ``self.test_y``,
        ``self.train_raw_sentences`` and ``self.test_raw_sentences``.
        """
        self.data = two_neighbour_data.get_training_data(
            dictionary, self.char, self.pinyins, self.word_vec, self.n)

        # One list of per-class pieces for each output; stacked after the loop.
        x_train_parts, y_train_parts, sent_train_parts = [], [], []
        x_test_parts, y_test_parts, sent_test_parts = [], [], []

        for class_idx, pinyin in enumerate(self.pinyins):
            n_train = int(train_split * self.data['y_count'][pinyin])

            # Boolean row mask: True where the one-hot label equals this class.
            class_one_hot = utils.index_to_one_hot(class_idx, self.cate_count)
            row_mask = (self.data['y'] == class_one_hot).all(axis=1)

            class_x = self.data['x'][row_mask, :]
            class_y = self.data['y'][row_mask, :]
            class_sentences = self.data['x_sentences'][row_mask]

            x_train_parts.append(class_x[:n_train])
            y_train_parts.append(class_y[:n_train])
            x_test_parts.append(class_x[n_train:])
            y_test_parts.append(class_y[n_train:])
            sent_train_parts.append(class_sentences[:n_train])
            sent_test_parts.append(class_sentences[n_train:])

        self.train_x = np.vstack(x_train_parts)
        self.train_y = np.vstack(y_train_parts)
        self.test_x = np.vstack(x_test_parts)
        self.test_y = np.vstack(y_test_parts)
        self.train_raw_sentences = np.hstack(sent_train_parts)
        self.test_raw_sentences = np.hstack(sent_test_parts)
示例#2
0
 def read_labels(path_to_labels):
     """
     Read the STL-10 label file and return its labels one-hot encoded.

     :param path_to_labels: path to the binary file containing labels from the STL-10 dataset
     :return: the value of ``utils.index_to_one_hot`` applied to the
              zero-based labels with 10 classes
     """
     with open(path_to_labels, 'rb') as label_file:
         raw_labels = np.fromfile(label_file, dtype=np.uint8)

     # Labels are stored as indices in [1, 10]; shift to zero-based before encoding.
     zero_based_labels = raw_labels - 1
     return utils.index_to_one_hot(zero_based_labels, 10)
示例#3
0
文件: Batch.py 项目: jkjan/NLP
    def get_batch(self):
        """
        Accumulate one-hot encoded words for a batch, starting at the current cursor.

        Reads the word at position (``self.sentence``, ``self.word_in_sentence``)
        of the module-level ``tokenized`` data ``batch_size`` times, looks it up
        in ``vocabulary``, encodes it with ``index_to_one_hot`` and concatenates
        the result onto ``input`` along dimension 0.

        NOTE(review): this definition looks truncated. In the visible code
        ``output`` is never filled, the result of ``get_label`` is discarded,
        ``self.word_in_sentence`` is never advanced and nothing is returned.
        Confirm against the full source before relying on it.
        """
        # Both tensors start empty; only `input` is extended below.
        # NOTE: the local name `input` shadows the builtin of the same name.
        input = torch.Tensor()
        output = torch.Tensor()

        for i in range(0, batch_size):
            # Cursor ran past the end of the current sentence: move to the
            # first word of the next one.
            if self.word_in_sentence == len(tokenized[self.sentence]):
                self.sentence += 1
                self.word_in_sentence = 0

            # Map the current word to its vocabulary index and append its
            # encoding to the batch.
            index = vocabulary[tokenized[self.sentence][self.word_in_sentence]]
            adding = index_to_one_hot(index)
            input = torch.cat([input, adding], 0)

            # NOTE(review): return value is ignored here — presumably it was
            # meant to be appended to `output`; verify.
            get_label(tokenized[self.sentence], self.word_in_sentence)
def create_examples(input_path, data_file, example_length):
    """
    Loads all digraph data in input_path, creates examples of length example_length
    and corresponding labels.

    Every file in input_path whose name does not start with "." is read line
    by line; each line is parsed as whitespace-separated integers and becomes
    one feature. Every example_length consecutive features of a file form one
    example. A trailing group shorter than example_length is dropped.

    The label of an example is the position of its file in the directory
    listing. Hidden entries are skipped but still count towards both the
    position and the number of users, so the label width equals the total
    number of directory entries.

    Args:
    input_path: directory containing one data file per user
    data_file: object providing create_dataset(name, data=..., maxshape=..., dtype=...)
               (presumably an h5py file or group - confirm against the caller)
    example_length: number of features per example

    Returns:
    Matrix X of shape (#examples, example_length, feature_length)
    Matrix y of shape (#examples, #users)

    Both matrices are also written to data_file as "X_plain" and "y_plain".
    """

    # List the directory once so that the user count and the label indices
    # are derived from the same snapshot of its contents.
    user_file_names = os.listdir(input_path)
    n_users = len(user_file_names)

    X = []
    y = []

    print("Generating examples...")
    # enumerate() has no length, so pass the total explicitly for the progress bar.
    for i, user_file_name in tqdm(enumerate(user_file_names), total=n_users):
        if user_file_name.startswith("."):
            continue
        # os.path.join works whether or not input_path ends with a separator.
        with open(os.path.join(input_path, user_file_name), "r") as user_file:
            example = []
            for line in user_file:
                example.append(tuple(map(int, line.split())))

                if len(example) == example_length:
                    X.append(np.asarray(example))
                    y.append(i)
                    example = []

    X = np.asarray(X)
    y = utils.index_to_one_hot(np.asarray(y), n_users)

    data_file.create_dataset("X_plain",
                             data=X,
                             maxshape=(None, example_length, FEATURE_LENGTH),
                             dtype=float)
    data_file.create_dataset("y_plain",
                             data=y,
                             maxshape=(None, n_users),
                             dtype=float)

    return X, y
示例#5
0
def read_CIFAR_100(cifar_path, train=True):
    """
    Read one split of the CIFAR-100 dataset.

    :param cifar_path: data path for cifar-100; the split file name
                       ("train" or "test") is appended to it directly
    :param train: read the train split when truthy, the test split otherwise
    :return: data and its label

    Note:
        data (#samples, 32, 32, 3)
        labels: fine labels passed through utils.index_to_one_hot with
                100 classes, i.e. (#samples, 100)
    """

    split_name = "train" if train else "test"
    split_dict = utils.unpickle(cifar_path + split_name)

    # Wrap the single batch in a list so the arrays carry a leading batch
    # axis of length one, which is folded away below.
    images = np.asarray([split_dict[b"data"]])
    fine_labels = np.asarray([split_dict[b"fine_labels"]])

    # Flatten (batch, sample) into one axis, unpack the channel-first pixel
    # layout, then move the channels to the last axis.
    n_images = images.shape[0] * images.shape[1]
    images = images.reshape((n_images, 3, 32, 32)).transpose(0, 2, 3, 1)

    n_labels = fine_labels.shape[0] * fine_labels.shape[1]
    label_indices = fine_labels.reshape((n_labels, )).tolist()

    return images, utils.index_to_one_hot(label_indices, 100)
示例#6
0
def read_CIFAR_10(cifar_path, train=True):
    """
    Read the raw CIFAR-10 data located in cifar_path and return it as
    numpy arrays:

    data (#samples, 32, 32, 3)
    labels (#samples, 10)

    When train is truthy the five files data_batch_1 .. data_batch_5 are
    read; otherwise only test_batch is read. File names are appended to
    cifar_path directly.
    """

    if train:
        batch_names = ["data_batch_" + str(i) for i in range(1, 6)]
    else:
        batch_names = ["test_batch"]

    image_batches = []
    label_batches = []
    for batch_name in batch_names:
        batch_dict = utils.unpickle(cifar_path + batch_name)
        image_batches.append(batch_dict[b"data"])
        label_batches.append(batch_dict[b"labels"])

    # Stack the batches, merge the (batch, sample) axes, unpack the
    # channel-first pixel layout and move the channels to the last axis.
    images = np.asarray(image_batches)
    n_images = images.shape[0] * images.shape[1]
    images = images.reshape((n_images, 3, 32, 32)).transpose(0, 2, 3, 1)

    class_ids = np.asarray(label_batches)
    n_labels = class_ids.shape[0] * class_ids.shape[1]
    label_indices = class_ids.reshape((n_labels, )).tolist()

    return images, utils.index_to_one_hot(label_indices, 10)