Exemplo n.º 1
0
    def __create_xy_train(self,
                          tag_file,
                          embedding_file,
                          data_size=1,
                          look_back=5,
                          threshold=0,
                          suffix=None):
        """Build training arrays from non-overlapping windows of words.

        The corpus is walked in steps of ``look_back``. For every window the
        per-word tag candidates from ``DataUtils.extract_tag_dict`` are
        expanded with ``DataUtils.cartesian``; each resulting tag sequence
        yields one sample, so a window's word embeddings are repeated once
        per tag sequence.

        Args:
            tag_file: Passed to ``DataUtils.load_corpus``.
            embedding_file: Passed to ``DataUtils.load_embeddings``.
            data_size: Fraction of the corpus words to use; values above 1
                are capped at 1. The resulting word count is rounded down to
                a multiple of ``look_back``.
            look_back: Number of consecutive words per window.
            threshold: Forwarded to ``DataUtils.extract_tag_dict``.
            suffix: Optional pair; when not ``None``, ``suffix[0]`` and
                ``suffix[1]`` are forwarded to
                ``DataUtils.add_suffix_embeddings``.

        Returns:
            Tuple ``(x_train, y_train)`` of NumPy arrays. Each ``x_train``
            entry holds the ``look_back`` word embeddings of a window and the
            matching ``y_train`` entry holds the ``look_back`` tag vectors of
            one tag sequence. Both are empty arrays when no sample is built.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        # Honour the requested fraction (the previous hard-coded size
        # override ignored it and over-indexed short corpora), then trim to
        # a whole number of windows.
        usable = int(len(words) * min(data_size, 1))
        data_size = usable - usable % look_back

        # Clamp to 1 so small inputs do not trigger a modulo-by-zero.
        progress_step = max(1, int(data_size / (10 * look_back)))

        # Collect samples in plain lists and convert once at the end;
        # np.append would re-copy the whole array for every sample.
        x_rows = []
        y_rows = []

        for idx in range(0, data_size, look_back):
            window = range(idx, idx + look_back)

            # Words without an embedding fall back to the "UNK" vector.
            word_inputs = np.array([
                word_emb[word_keys[pos]]
                if word_keys[pos] in word_emb else word_emb["UNK"]
                for pos in window
            ])

            dict_tag_inputs = [tag_dict[words[pos]] for pos in window]
            tag_sequences = DataUtils.cartesian(np.array(dict_tag_inputs))
            for tag_sequence in tag_sequences:
                x_rows.append(word_inputs)
                y_rows.append([tag_emb[tag] for tag in tag_sequence])

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        x_train = np.array(x_rows)
        y_train = np.array(y_rows)

        return x_train, y_train
Exemplo n.º 2
0
    def __create_xy_train(self, tag_file, embedding_file, data_size, window_size, threshold, suffix):
        """Build training arrays from sliding windows of words.

        For every start index a window of ``window_size`` word embeddings is
        concatenated into a single vector. The word at offset
        ``int(window_size / 2)`` inside the window supplies the labels: one
        sample is produced per candidate tag listed for that word in the
        dictionary returned by ``DataUtils.extract_tag_dict``.

        Args:
            tag_file: Passed to ``DataUtils.load_corpus``.
            embedding_file: Passed to ``DataUtils.load_embeddings``.
            data_size: Fraction of the corpus words to use as window starts;
                the count is capped at ``len(words) - window_size``.
            window_size: Number of consecutive words per window.
            threshold: Forwarded to ``DataUtils.extract_tag_dict``.
            suffix: Optional pair; when not ``None``, ``suffix[0]`` and
                ``suffix[1]`` are forwarded to
                ``DataUtils.add_suffix_embeddings``.

        Returns:
            Tuple ``(x_train, y_train)`` of NumPy arrays: each ``x_train``
            row is a concatenated window vector and the matching ``y_train``
            row is the vector of one candidate tag. Both are empty arrays
            when no sample is built.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        data_size = min(int(len(words) * data_size), len(words) - window_size)

        # Clamp to 1 so fewer than 100 samples do not cause modulo-by-zero.
        progress_step = max(1, data_size // 100)
        # The first word is always included, even for window_size < 1.
        offsets = range(max(window_size, 1))
        centre = int(window_size / 2)

        # Collect samples in plain lists and convert once at the end;
        # np.append would re-copy the whole array for every sample.
        x_rows = []
        y_rows = []

        for idx in range(data_size):
            # Words without an embedding fall back to the "UNK" vector.
            word_input = np.concatenate([
                word_emb[word_keys[idx + offset]]
                if word_keys[idx + offset] in word_emb else word_emb["UNK"]
                for offset in offsets
            ], axis=0)

            for tag in tag_dict[words[idx + centre]]:
                x_rows.append(word_input)
                y_rows.append(tag_emb[tag])

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        x_train = np.array(x_rows)
        y_train = np.array(y_rows)

        return x_train, y_train