Example #1
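All of the methods below are class methods apparently taken from the same tagging project; they rely on numpy being available as np and on a project-specific DataUtils helper (load_corpus, load_embeddings, create_onehot_vectors, extract_data, ...). A minimal header sketch of the surrounding module, with a hypothetical import path for DataUtils:

import numpy as np                    # referenced as np in every snippet below

from data_utils import DataUtils      # hypothetical import path; DataUtils is a project helper, not a library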
    def __create_xy_train(self,
                          tag_file,
                          embedding_file,
                          data_size=1,
                          look_back=5,
                          threshold=0,
                          suffix=None):
        x_train = []
        y_train = []

        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        data_size = int(len(words) * min(data_size, 1)) - int(
            len(words) * min(data_size, 1)) % look_back

        for idx in np.arange(0, data_size, look_back):
            dict_tag_inputs = [tag_dict[words[idx]]]

            word_inputs = [
                word_emb[word_keys[idx]]
            ] if word_keys[idx] in word_emb else [word_emb["UNK"]]
            for widx in range(1, look_back):
                word_inputs = np.append(
                    word_inputs, [word_emb[word_keys[idx + widx]]] if
                    word_keys[idx + widx] in word_emb else [word_emb["UNK"]],
                    axis=0)
                dict_tag_inputs.append(tag_dict[words[idx + widx]])

            dict_tag_inputs = DataUtils.cartesian(np.array(dict_tag_inputs))
            for jdx in range(len(dict_tag_inputs)):
                tag_inputs = [tag_emb[tag] for tag in dict_tag_inputs[jdx]]
                if idx == 0 and jdx == 0:
                    x_train = [word_inputs]
                    y_train = [tag_inputs]
                else:
                    x_train = np.append(x_train, [word_inputs], axis=0)
                    y_train = np.append(y_train, [tag_inputs], axis=0)

            if idx % int(data_size / (10 * look_back)) == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        x_train = np.array(x_train)
        y_train = np.array(y_train)

        return x_train, y_train
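The distinguishing step of this training builder is the cartesian expansion: every word in the look_back window may have several candidate tags in tag_dict, and each combination of candidates becomes its own (x, y) pair sharing the same word_inputs. A standalone sketch of that expansion, assuming DataUtils.cartesian enumerates one tag per row like itertools.product (the real helper is not shown here):

from itertools import product

# Hypothetical candidate-tag sets for a window of three words.
dict_tag_inputs = [["NOUN", "VERB"],  # word 1
                   ["DET"],           # word 2
                   ["ADJ", "NOUN"]]   # word 3

for combo in product(*dict_tag_inputs):
    print(combo)
# ('NOUN', 'DET', 'ADJ')
# ('NOUN', 'DET', 'NOUN')
# ('VERB', 'DET', 'ADJ')
# ('VERB', 'DET', 'NOUN')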
Example #2
    def __create_xy_test(self,
                         tag_file,
                         embedding_file,
                         data_size=1,
                         look_back=5,
                         suffix=None):
        x_test = []
        y_test = []

        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words, tags = DataUtils.extract_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

        data_size = int(len(words) * min(data_size, 1)) - int(
            len(words) * min(data_size, 1)) % look_back

        for idx in np.arange(0, data_size, look_back):
            x_timestep = []
            y_timestep = []

            for jdx in range(look_back):
                word_input = word_emb[word_keys[idx + jdx]] if word_keys[
                    idx + jdx] in word_emb else word_emb["UNK"]
                tag_input = tag_emb[tags[idx + jdx]]

                if jdx == 0:
                    x_timestep = [word_input]
                    y_timestep = [tag_input]
                else:
                    x_timestep = np.append(x_timestep, [word_input], axis=0)
                    y_timestep = np.append(y_timestep, [tag_input], axis=0)

            x_timestep = np.array(x_timestep)
            y_timestep = np.array(y_timestep)

            if idx == 0:
                x_test = [x_timestep]
                y_test = [y_timestep]
            else:
                x_test = np.append(x_test, [x_timestep], axis=0)
                y_test = np.append(y_test, [y_timestep], axis=0)

            if idx % int(data_size / (10 * look_back)) == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        x_test = np.array(x_test)
        y_test = np.array(y_test)

        return x_test, y_test
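Unlike the training builder above, the test builder keeps exactly one gold tag per token and walks the corpus in non-overlapping strides of look_back tokens, so the result is a pair of plain 3-D tensors. A shape sketch with invented sizes (the real embedding width and tag count come from embedding_file and the corpus):

import numpy as np

look_back, emb_dim, n_tags = 5, 100, 17   # illustrative sizes only
n_tokens = 20                             # pretend corpus length
n_windows = n_tokens // look_back         # one window per non-overlapping stride

x_test = np.zeros((n_windows, look_back, emb_dim))
y_test = np.zeros((n_windows, look_back, n_tags))
print(x_test.shape, y_test.shape)         # (4, 5, 100) (4, 5, 17)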
Example #3
    def __create_xy(self, tag_file, embedding_file, data_size, window_size,
                    available_tags, suffix):
        x = []
        y = []

        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words, tags = DataUtils.extract_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

        data_size = int(len(words) * data_size)

        for idx in range(data_size):
            tag = tags[idx + int(window_size / 2)]
            if len(available_tags) == 0 or tag in available_tags:
                word_input = word_emb[word_keys[idx]] if word_keys[
                    idx] in word_emb else word_emb["UNK"]
                for widx in range(1, window_size):
                    word_input = np.append(
                        word_input,
                        word_emb[word_keys[idx + widx]] if
                        word_keys[idx + widx] in word_emb else word_emb["UNK"],
                        axis=0)

                tag_input = tag_emb[tag]

                # use len(x) so the first sample that passes the tag filter initialises the arrays
                if len(x) == 0:
                    x = [word_input]
                    y = [tag_input]
                else:
                    x = np.append(x, [word_input], axis=0)
                    y = np.append(y, [tag_input], axis=0)

            if idx % int(data_size / 10) == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))
        return x, y
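Here the window of embeddings is collapsed into a single flat feature vector (np.append on 1-D vectors simply concatenates them) and the label is the one-hot tag of the centre word at idx + window_size // 2. A small sketch of the flattening step with an invented 4-dimensional embedding:

import numpy as np

window_size, emb_dim = 3, 4                          # illustrative sizes only
window = [np.arange(emb_dim) + 10 * i for i in range(window_size)]

word_input = window[0]
for widx in range(1, window_size):
    word_input = np.append(word_input, window[widx], axis=0)

print(word_input.shape)  # (12,) -> one flat vector of length window_size * emb_dim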
Example #4
    def __create_xy_train(self, tag_file, embedding_file, data_size, window_size, threshold, suffix):
        x_train = []
        y_train = []

        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        data_size = min(int(len(words) * data_size), len(words) - window_size)

        for idx in range(data_size):
            word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]
            for widx in range(1, window_size):
                word_input = np.append(word_input, word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"], axis=0)

            tag_inputs = [tag_emb[tag] for tag in tag_dict[words[idx + int(window_size / 2)]]]

            for tidx in range(len(tag_inputs)):
                tag_input = tag_inputs[tidx]
                if idx == 0 and tidx == 0:
                    x_train = [word_input]
                    y_train = [tag_input]
                else:
                    x_train = np.append(x_train, [word_input], axis=0)
                    y_train = np.append(y_train, [tag_input], axis=0)

            if idx % int(data_size / 100) == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        x_train = np.array(x_train)
        y_train = np.array(y_train)

        return x_train, y_train
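All four builders grow their arrays with np.append inside the loop, which reallocates and copies the whole array on every call and makes the build quadratic in the number of samples. A common alternative, shown here only as a sketch (the fake_* inputs below are hypothetical stand-ins for the embeddings built above), is to collect rows in Python lists and stack once at the end:

import numpy as np

# Hypothetical stand-ins for the per-sample inputs produced in the loop above.
fake_word_inputs = [np.random.rand(300) for _ in range(4)]
fake_tag_inputs = [np.eye(17)[i % 17] for i in range(4)]

x_rows, y_rows = [], []
for word_input, tag_input in zip(fake_word_inputs, fake_tag_inputs):
    x_rows.append(word_input)
    y_rows.append(tag_input)

x_train = np.stack(x_rows)   # one allocation instead of a copy per np.append call
y_train = np.stack(y_rows)
print(x_train.shape, y_train.shape)  # (4, 300) (4, 17)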