Example #1
0
    def __create_xy_train(self,
                          tag_file,
                          embedding_file,
                          data_size=1,
                          look_back=5,
                          threshold=0,
                          suffix=None):
        """Build training inputs/targets from a tagged corpus.

        Args:
            tag_file: path of the tagged corpus loaded via DataUtils.
            embedding_file: path of the word-embedding file.
            data_size: fraction (<= 1) of the corpus to use.
            look_back: number of consecutive words per training sample.
            threshold: forwarded to DataUtils.extract_tag_dict to filter
                candidate tags.
            suffix: optional 2-element sequence forwarded to
                DataUtils.add_suffix_embeddings.

        Returns:
            Tuple (x_train, y_train) of numpy arrays. Each sample holds
            look_back word embeddings; one sample is emitted per tag
            combination from the cartesian product of the words' candidate
            tags, paired with the one-hot tag sequence.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        # Clamp the requested fraction to the corpus size and truncate to a
        # multiple of look_back so every sample is complete.
        data_size = int(len(words) * min(data_size, 1)) - int(
            len(words) * min(data_size, 1)) % look_back
        # BUG FIX: removed leftover debug override `data_size = 53750`,
        # which silently discarded the value computed above.

        x_rows = []
        y_rows = []
        # BUG FIX: max(1, ...) avoids ZeroDivisionError when the usable
        # corpus is smaller than 10 * look_back words.
        progress_step = max(1, int(data_size / (10 * look_back)))

        for idx in np.arange(0, data_size, look_back):
            dict_tag_inputs = [tag_dict[words[idx]]]

            # Unknown words fall back to the "UNK" embedding.
            word_inputs = [
                word_emb[word_keys[idx]]
            ] if word_keys[idx] in word_emb else [word_emb["UNK"]]
            for widx in range(1, look_back):
                word_inputs = np.append(
                    word_inputs, [word_emb[word_keys[idx + widx]]] if
                    word_keys[idx + widx] in word_emb else [word_emb["UNK"]],
                    axis=0)
                dict_tag_inputs.append(tag_dict[words[idx + widx]])

            # One training sample per combination of candidate tags.
            # Accumulating in Python lists and stacking once at the end
            # avoids the O(n^2) cost of repeated np.append.
            dict_tag_inputs = DataUtils.cartesian(np.array(dict_tag_inputs))
            for tag_combo in dict_tag_inputs:
                x_rows.append(word_inputs)
                y_rows.append([tag_emb[tag] for tag in tag_combo])

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        return np.array(x_rows), np.array(y_rows)
Example #2
0
    def __create_xy_test(self,
                         tag_file,
                         embedding_file,
                         data_size=1,
                         look_back=5,
                         suffix=None):
        """Build test inputs/targets from a tagged corpus.

        Args:
            tag_file: path of the tagged corpus loaded via DataUtils.
            embedding_file: path of the word-embedding file.
            data_size: fraction (<= 1) of the corpus to use.
            look_back: number of consecutive words per test sample.
            suffix: optional 2-element sequence forwarded to
                DataUtils.add_suffix_embeddings.

        Returns:
            Tuple (x_test, y_test) of numpy arrays: per sample, look_back
            word embeddings and the matching one-hot gold tags.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words, tags = DataUtils.extract_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

        # Clamp to the corpus size and truncate to a multiple of look_back.
        data_size = int(len(words) * min(data_size, 1)) - int(
            len(words) * min(data_size, 1)) % look_back

        x_rows = []
        y_rows = []
        # BUG FIX: max(1, ...) avoids ZeroDivisionError when the usable
        # corpus is smaller than 10 * look_back words.
        progress_step = max(1, int(data_size / (10 * look_back)))

        for idx in np.arange(0, data_size, look_back):
            x_timestep = []
            y_timestep = []

            for jdx in range(look_back):
                key = word_keys[idx + jdx]
                # Unknown words fall back to the "UNK" embedding.
                x_timestep.append(
                    word_emb[key] if key in word_emb else word_emb["UNK"])
                y_timestep.append(tag_emb[tags[idx + jdx]])

            # Accumulate in lists and stack once at the end; repeated
            # np.append on the growing arrays was O(n^2).
            x_rows.append(np.array(x_timestep))
            y_rows.append(np.array(y_timestep))

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        return np.array(x_rows), np.array(y_rows)
Example #3
0
    def __create_xy(self, tag_file, embedding_file, data_size, window_size,
                    available_tags, suffix):
        """Build window-based inputs/targets from a tagged corpus.

        Args:
            tag_file: path of the tagged corpus loaded via DataUtils.
            embedding_file: path of the word-embedding file.
            data_size: fraction of the corpus to use.
            window_size: number of consecutive word embeddings concatenated
                into each input row; the label is the center word's tag.
            available_tags: collection of tags to keep; an empty collection
                keeps everything.
            suffix: optional 2-element sequence forwarded to
                DataUtils.add_suffix_embeddings.

        Returns:
            Tuple (x, y) of numpy arrays of concatenated window embeddings
            and one-hot tags.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(
            DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0],
                                                       suffix[1])

        words, tags = DataUtils.extract_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

        data_size = int(len(words) * data_size)

        x_rows = []
        y_rows = []
        # BUG FIX: max(1, ...) avoids ZeroDivisionError when data_size < 10.
        progress_step = max(1, int(data_size / 10))

        for idx in range(data_size):
            # The label is the tag of the word at the window's center.
            tag = tags[idx + int(window_size / 2)]
            if len(available_tags) == 0 or tag in available_tags:
                # Unknown words fall back to the "UNK" embedding.
                word_input = word_emb[word_keys[idx]] if word_keys[
                    idx] in word_emb else word_emb["UNK"]
                for widx in range(1, window_size):
                    word_input = np.append(
                        word_input,
                        word_emb[word_keys[idx + widx]] if
                        word_keys[idx + widx] in word_emb else word_emb["UNK"],
                        axis=0)

                # BUG FIX: the original seeded the accumulators only when
                # idx == 0; if the first word's tag was filtered out by
                # available_tags, the later np.append on an empty list raised
                # a dimension error. List accumulation sidesteps that.
                x_rows.append(word_input)
                y_rows.append(tag_emb[tag])

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        return np.array(x_rows), np.array(y_rows)
Example #4
0
    def __create_xy_train(self, tag_file, embedding_file, data_size, window_size, threshold, suffix):
        """Build window-based training inputs/targets from a tagged corpus.

        Args:
            tag_file: path of the tagged corpus loaded via DataUtils.
            embedding_file: path of the word-embedding file.
            data_size: fraction of the corpus to use.
            window_size: number of consecutive word embeddings concatenated
                into each input row.
            threshold: forwarded to DataUtils.extract_tag_dict to filter
                candidate tags.
            suffix: optional 2-element sequence forwarded to
                DataUtils.add_suffix_embeddings.

        Returns:
            Tuple (x_train, y_train) of numpy arrays: one sample per
            candidate tag of each window's center word.
        """
        corpus = DataUtils.load_corpus(tag_file)
        tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
        word_emb = DataUtils.load_embeddings(embedding_file)
        if suffix is not None:
            word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

        words = DataUtils.extract_word_data(corpus)
        word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
        tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

        # Never index past the last complete window.
        data_size = min((int(len(words) * data_size), len(words) - window_size))

        x_rows = []
        y_rows = []
        # BUG FIX: max(1, ...) avoids ZeroDivisionError when data_size < 100.
        progress_step = max(1, int(data_size / 100))

        for idx in range(data_size):
            # Unknown words fall back to the "UNK" embedding.
            word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]
            for widx in range(1, window_size):
                word_input = np.append(word_input, word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"], axis=0)

            # One sample per candidate tag of the window's center word.
            # List accumulation also fixes the original's fragile
            # `idx == 0 and tidx == 0` seeding, which broke when the first
            # word had no candidate tags.
            for tag in tag_dict[words[idx + int(window_size / 2)]]:
                x_rows.append(word_input)
                y_rows.append(tag_emb[tag])

            if idx % progress_step == 0:
                DataUtils.update_message(str(int(idx / data_size * 100)))

        return np.array(x_rows), np.array(y_rows)
Example #5
0
    def __create_xy(self, dependency_tree, embedding_file, data_size, look_back, test=False):
        """Build paired forward/backward word-embedding and tag sequences plus
        a head-arc probability target from a dependency tree.

        Args:
            dependency_tree: source for DataUtils.parse_dependency_tree.
            embedding_file: word-embedding file loaded via DataUtils.
            data_size: fraction (<= 1) of the sentences to use.
            look_back: window length; 0 means "longest sentence in the slice".
            test: when True the sentence list is reversed, so test data is
                drawn from the opposite end of the corpus.

        Returns:
            (word_data, tag_data, probability) where word_data and tag_data
            are [(input_forward, input_backward), (head_forward,
            head_backward)] and probability is 1.0 for tokens that have a
            head within the same window part, else 0.0.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(dependency_tree)
        word_vectors = DataUtils.create_onehot_vectors(words)
        #word_int = DataUtils.create_int_dict(words)
        word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences)*min(data_size, 1))

        if test:
            sentences.reverse()

        # look_back == 0 means "fit the longest sentence in the used slice".
        if look_back == 0:
            for sentence in sentences[:data_size]:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        word_input_forward = []
        word_input_backward = []
        word_head_forward = []
        word_head_backward = []

        tag_input_forward = []
        tag_input_backward = []
        tag_head_forward = []
        tag_head_backward = []

        probability = []

        progress = 0

        for sentence in sentences[:data_size]:
            # Chop each sentence into consecutive windows of look_back tokens.
            parts = [sentence[i:i+look_back] for i in range(0,len(sentence),look_back)]
            for part in parts:
                # Reusable padded buffers: row 0 holds the forward
                # (left-padded) order, row 1 the reversed order.
                # 300 is assumed to be the embedding width — TODO confirm
                # against the embedding file.
                word_temp = np.zeros((2,look_back,300))
                tag_temp = np.zeros((2,look_back,),dtype="int32")
                prob_temp = np.zeros((look_back,),dtype="float32")

                for idx in range(len(part)):
                    word = part[idx]["word"]
                    # Unknown words fall back to the "UNK" embedding.
                    word_temp[0][look_back-len(part)+idx] = word_emb[word] if word in word_emb else word_emb["UNK"]
                    word_temp[1][look_back-idx-1] = word_emb[word] if word in word_emb else word_emb["UNK"]

                    tag = part[idx]["tag"]
                    tag_temp[0][look_back-len(part)+idx] = tag_int[tag]
                    tag_temp[1][look_back-idx-1] = tag_int[tag]

                # Per-token prefix/suffix views of the padded buffers.
                word_instance = np.zeros((len(part),2,look_back,300))
                tag_instance = np.zeros((len(part),2,look_back,),dtype="int32")

                # NOTE(review): head_instance is never used below.
                head_instance = np.zeros((look_back,1), dtype="float32")

                for idx in range(len(part)):
                    word_instance[idx][0][look_back-idx-1:] = word_temp[0][look_back-len(part):look_back-len(part)+idx+1]
                    word_instance[idx][1][look_back-len(part)+idx:] = word_temp[1][look_back-len(part):look_back-idx]

                    tag_instance[idx][0][look_back-idx-1:] = tag_temp[0][look_back-len(part):look_back-len(part)+idx+1]
                    tag_instance[idx][1][look_back-len(part)+idx:] = tag_temp[1][look_back-len(part):look_back-idx]

                for idx in range(len(part)):
                    word_input = np.zeros((2,2,look_back,300))
                    tag_input = np.zeros((2,2,look_back,),dtype="int32")
                    # Rebinds the buffer above to a scalar flag: becomes 1.0
                    # iff some other token in this part is idx's head.
                    prob_temp = 0.0

                    # NOTE(review): word_input/tag_input are overwritten on
                    # every jdx != idx iteration, so only the LAST jdx's
                    # instance survives the loop — confirm this is intended.
                    for jdx in range(len(part)):
                        if idx != jdx:
                            if part[idx]["head"] == part[jdx]["word"]:
                                prob_temp = 1.0
                            word_input[0] = word_instance[idx]
                            tag_input[0] = tag_instance[idx]
                            word_input[1] = word_instance[jdx]
                            tag_input[1] = tag_instance[jdx]

                    if len(word_input_forward) == 0:
                        word_input_forward = [word_input[0][0]]
                        word_input_backward = [word_input[0][1]]
                        # NOTE(review): the word "head" rows use
                        # word_instance[1] (fixed token index 1) while the
                        # tag "head" rows use tag_input[1] — possibly meant
                        # word_input[1]; also word_instance[1] raises
                        # IndexError for single-token parts. Verify.
                        word_head_forward = [word_instance[1][0]]
                        word_head_backward = [word_instance[1][1]]

                        tag_input_forward = [tag_input[0][0]]
                        tag_input_backward = [tag_input[0][1]]
                        tag_head_forward = [tag_input[1][0]]
                        tag_head_backward = [tag_input[1][1]]

                        probability = [prob_temp]
                    else:
                        word_input_forward = np.append(word_input_forward,[word_input[0][0]], axis=0)
                        word_input_backward = np.append(word_input_backward,[word_input[0][1]], axis=0)
                        word_head_forward = np.append(word_head_forward,[word_instance[1][0]], axis=0)
                        word_head_backward = np.append(word_head_backward,[word_instance[1][1]], axis=0)

                        tag_input_forward = np.append(tag_input_forward,[tag_input[0][0]], axis=0)
                        tag_input_backward = np.append(tag_input_backward,[tag_input[0][1]], axis=0)
                        tag_head_forward = np.append(tag_head_forward,[tag_input[1][0]], axis=0)
                        tag_head_backward = np.append(tag_head_backward,[tag_input[1][1]], axis=0)

                        probability = np.append(probability, [prob_temp], axis=0)

            DataUtils.update_message(str(progress)+"/"+str(data_size))
            progress += 1

        word_data = [(word_input_forward, word_input_backward), (word_head_forward, word_head_backward)]
        tag_data = [(tag_input_forward, tag_input_backward), (tag_head_forward, tag_head_backward)]

        return word_data, tag_data, probability
Example #6
0
    def __create_xy(self, embedding_file, data_size, look_back, test=False):
        """Build forward/backward word-embedding and tag sequences plus a
        per-position head indicator from the language's dependency tree.

        Args:
            embedding_file: embedding file; loaded in fasttext format when
                self.language == "turkish".
            data_size: fraction (<= 1) of the sentences to use.
            look_back: window length; 0 means "longest sentence in the slice".
            test: when True the sentence list is reversed, so test data is
                drawn from the opposite end of the corpus.

        Returns:
            (word_data, tag_data, head) where word_data/tag_data are
            [(full_forward, full_backward), (instance_forward,
            instance_backward)] and head is an (N, look_back, 1) indicator
            of which position within the part holds each token's head.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(self.language)
        word_vectors = DataUtils.create_onehot_vectors(words)
        #word_int = DataUtils.create_int_dict(words)
        word_emb = None
        # Turkish embeddings ship in fasttext format.
        if self.language == "turkish":
            word_emb = DataUtils.load_embeddings(embedding_file, "fasttext")
        else:
            word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences) * min(data_size, 1))

        if test:
            sentences.reverse()

        # look_back == 0 means "fit the longest sentence in the used slice".
        if look_back == 0:
            for sentence in sentences[:data_size]:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        word_full_forward = []
        word_full_backward = []
        word_instance_forward = []
        word_instance_backward = []

        tag_full_forward = []
        tag_full_backward = []
        tag_instance_forward = []
        tag_instance_backward = []

        head = []

        progress = 0

        for sentence in sentences[:data_size]:
            # Chop each sentence into consecutive windows of look_back tokens.
            parts = [
                sentence[i:i + look_back]
                for i in range(0, len(sentence), look_back)
            ]
            for part in parts:
                # Reusable padded buffers: row 0 is the forward (left-padded)
                # order, row 1 the reversed order. 300 is assumed to be the
                # embedding width — TODO confirm against the embedding file.
                word_temp = np.zeros((2, look_back, 300))
                tag_temp = np.zeros((
                    2,
                    look_back,
                ), dtype="int32")

                head_instance = np.zeros((look_back, 1), dtype="float32")

                # NOTE(review): the jdx/zdx work below is nested inside this
                # idx loop, so each part is appended len(part)^2 times with
                # values that do not depend on idx — confirm the duplication
                # is intended.
                for idx in range(len(part)):
                    word = part[idx]["word"]
                    # Unknown words fall back to the "UNK" embedding.
                    word_temp[0][look_back - len(part) + idx] = word_emb[
                        word] if word in word_emb else word_emb["UNK"]
                    word_temp[1][look_back - idx - 1] = word_emb[
                        word] if word in word_emb else word_emb["UNK"]

                    tag = part[idx]["tag"]
                    tag_temp[0][look_back - len(part) + idx] = tag_int[tag]
                    tag_temp[1][look_back - idx - 1] = tag_int[tag]

                    word_instance = np.zeros((2, look_back, 300))
                    tag_instance = np.zeros((
                        2,
                        look_back,
                    ), dtype="int32")

                    for jdx in range(len(part)):
                        # Prefix (forward) / suffix (backward) slices of the
                        # padded buffers for position jdx.
                        word_instance[0][look_back - jdx - 1:] = word_temp[
                            0][look_back - len(part):look_back - len(part) +
                               jdx + 1]
                        word_instance[1][look_back - len(part) +
                                         jdx:] = word_temp[1][look_back -
                                                              len(part
                                                                  ):look_back -
                                                              jdx]

                        tag_instance[0][look_back - jdx -
                                        1:] = tag_temp[0][look_back -
                                                          len(part):look_back -
                                                          len(part) + jdx + 1]
                        tag_instance[1][look_back - len(part) +
                                        jdx:] = tag_temp[1][look_back -
                                                            len(part
                                                                ):look_back -
                                                            jdx]

                        head_instance = np.zeros((look_back, 1),
                                                 dtype="float32")

                        # Mark which position within the part is jdx's head.
                        for zdx in range(len(part)):
                            head_instance[zdx] = 1 if part[jdx][
                                "head"] == part[zdx]["word"] else 0
                        if len(word_full_forward) == 0:
                            word_full_forward = [word_temp[0]]
                            word_full_backward = [word_temp[1]]
                            word_instance_forward = [word_instance[0]]
                            word_instance_backward = [word_instance[1]]

                            tag_full_forward = [tag_temp[0]]
                            tag_full_backward = [tag_temp[1]]
                            tag_instance_forward = [tag_instance[0]]
                            tag_instance_backward = [tag_instance[1]]

                            head = [head_instance]
                        else:
                            word_full_forward = np.append(word_full_forward,
                                                          [word_temp[0]],
                                                          axis=0)
                            word_full_backward = np.append(word_full_backward,
                                                           [word_temp[1]],
                                                           axis=0)
                            word_instance_forward = np.append(
                                word_instance_forward, [word_instance[0]],
                                axis=0)
                            word_instance_backward = np.append(
                                word_instance_backward, [word_instance[1]],
                                axis=0)

                            tag_full_forward = np.append(tag_full_forward,
                                                         [tag_temp[0]],
                                                         axis=0)
                            tag_full_backward = np.append(tag_full_backward,
                                                          [tag_temp[1]],
                                                          axis=0)
                            tag_instance_forward = np.append(
                                tag_instance_forward, [tag_instance[0]],
                                axis=0)
                            tag_instance_backward = np.append(
                                tag_instance_backward, [tag_instance[1]],
                                axis=0)

                            head = np.append(head, [head_instance], axis=0)

            DataUtils.update_message(str(progress) + "/" + str(data_size))
            progress += 1

        word_data = [(word_full_forward, word_full_backward),
                     (word_instance_forward, word_instance_backward)]
        tag_data = [(tag_full_forward, tag_full_backward),
                    (tag_instance_forward, tag_instance_backward)]

        # NOTE(review): leftover debug output; it also raises AttributeError
        # when the accumulators are still plain lists (fewer than two
        # appended samples) since lists have no .shape.
        print(word_full_forward.shape, word_instance_forward.shape, head.shape)

        return word_data, tag_data, head
Example #7
0
    def __create_xy(self, parse_tree_file, data_size, seq_len, test=False):
        """Build word-id sequence pairs and head probabilities from a
        dependency parse tree, split into fixed train/test slices.

        Args:
            parse_tree_file: source for DataUtils.parse_dependency_tree.
            data_size: currently unused in the body; kept for interface
                compatibility with the sibling loaders.
            seq_len: window length each sentence is chopped into.
            test: True selects samples [5000:10000], False selects [:5000].

        Returns:
            ([forward_dep, backward_dep, forward_head, backward_head],
             [tags, probability]) as numpy arrays; probability is 1.0 where
            the candidate token is the dependent's head.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(
            parse_tree_file)
        word_int = DataUtils.create_int_dict(words)
        tag_int = DataUtils.create_onehot_vectors(tags)

        self.seq_len = seq_len
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        # Total number of (dependent, head-candidate) pairs across all
        # seq_len-padded sentence parts.
        data_len = 0
        for i in range(len(sentences)):
            data_len += int(np.ceil(
                len(sentences[i]) / seq_len)) * seq_len * seq_len

        forward = np.zeros((
            2,
            data_len,
            seq_len,
        ), dtype="int32")
        backward = np.zeros((
            2,
            data_len,
            seq_len,
        ), dtype="int32")
        probability = np.zeros((data_len, ), dtype="float32")
        # One-hot tag targets, sized from the corpus instead of the previous
        # hard-coded 18 so other tag sets work too.
        tags = np.zeros((data_len, self.distinct_tags))

        # BUG FIX: max(1, ...) avoids ZeroDivisionError when data_len < 100.
        progress_step = max(1, int(data_len / 100))

        idx = 0
        for sentence in sentences:
            parts = [
                sentence[i:i + seq_len]
                for i in range(0, len(sentence), seq_len)
            ]
            for part in parts:
                part_len = len(part)
                # Right-aligned prefix (forward) and suffix (backward)
                # word-id sequences for every position in the part.
                word_forward = np.zeros((seq_len, seq_len), dtype="int32")
                word_backward = np.zeros((seq_len, seq_len), dtype="int32")

                for jdx in range(part_len):
                    word_forward[jdx][seq_len - jdx - 1:] = [
                        word_int[part[i]["word"]] for i in range(jdx + 1)
                    ]
                    word_backward[jdx][seq_len - part_len + jdx:] = [
                        word_int[part[part_len - i - 1]["word"]]
                        for i in range(part_len - jdx)
                    ]

                # Emit every ordered pair (dependent jdx, head candidate zdx).
                for jdx in range(part_len):
                    for zdx in range(part_len):
                        tags[idx] = tag_int[part[jdx]["tag"]]
                        forward[0][idx] = word_forward[jdx]
                        forward[1][idx] = word_forward[zdx]
                        backward[0][idx] = word_backward[jdx]
                        backward[1][idx] = word_backward[zdx]
                        probability[idx] = 1.0 if part[jdx]["head"] == part[
                            zdx]["word"] else 0.0
                        idx += 1

                        if idx % progress_step == 0:
                            DataUtils.update_message(
                                str(int(idx / data_len * 100)))

        # NOTE(review): the 5000/10000 split boundaries are hard-coded;
        # consider deriving them from data_size.
        if test:
            forward = [
                np.array(forward[0][5000:10000]),
                np.array(forward[1][5000:10000])
            ]
            backward = [
                np.array(backward[0][5000:10000]),
                np.array(backward[1][5000:10000])
            ]
            probability = np.array(probability[5000:10000])
            tags = np.array(tags[5000:10000])
        else:
            forward = [
                np.array(forward[0][:5000]),
                np.array(forward[1][:5000])
            ]
            backward = [
                np.array(backward[0][:5000]),
                np.array(backward[1][:5000])
            ]
            probability = np.array(probability[:5000])
            tags = np.array(tags[:5000])

        return [forward[0], backward[0], forward[1],
                backward[1]], [tags, probability]
Example #8
0
    def __create_xy(self,
                    dependency_tree,
                    embedding_file,
                    data_size,
                    look_back,
                    test=False):
        """Build per-window word embeddings, head one-hots, and tag ids from
        a dependency tree.

        Args:
            dependency_tree: source for DataUtils.parse_dependency_tree.
            embedding_file: word-embedding file loaded via DataUtils.
            data_size: fraction (<= 1) of the sentences to use.
            look_back: window length; 0 means "longest sentence in the slice".
            test: when True the sentence list is reversed, so test data is
                drawn from the opposite end of the corpus.

        Returns:
            Tuple (word_data, head_data, tag_data) of numpy arrays, one row
            per flushed look_back window.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(
            dependency_tree)
        word_vectors = DataUtils.create_onehot_vectors(words)
        word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences) * min(data_size, 1))

        if test:
            sentences.reverse()

        # look_back == 0 means "fit the longest sentence in the used slice".
        if look_back == 0:
            for sentence in sentences[:data_size]:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        word_rows = []
        head_rows = []
        tag_rows = []

        progress = 0
        for sentence in sentences[:data_size]:
            # Reusable window buffers; 300 is assumed to be the embedding
            # width — TODO confirm against the embedding file.
            word_timestep = np.zeros((look_back, 300))
            head_timestep = np.zeros((look_back, len(words)))
            tag_timestep = np.zeros((look_back, ), dtype="int32")

            timestep = 0
            for element in sentence:
                word = element["word"]

                # ROOT contributes position padding but no features.
                if word != "ROOT":
                    # Unknown words fall back to the "UNK" embedding.
                    word_timestep[timestep % look_back] = word_emb[
                        word] if word in word_emb else word_emb["UNK"]

                    head = element["head"]
                    head_timestep[timestep % look_back] = word_vectors[head]

                    tag = element["tag"]
                    tag_timestep[timestep % look_back] = tag_int[tag]

                timestep += 1

                # Flush a full window, or the final partial window.
                if timestep % look_back == 0 or timestep == len(sentence):
                    # BUG FIX: store copies. The original seeded the
                    # accumulators with a reference to the reusable buffers
                    # (`word_data = [word_timestep]`), so the .fill(0) below
                    # zeroed the very first stored sample in place.
                    word_rows.append(word_timestep.copy())
                    head_rows.append(head_timestep.copy())
                    tag_rows.append(tag_timestep.copy())

                    word_timestep.fill(0)
                    head_timestep.fill(0)
                    tag_timestep.fill(0)

            DataUtils.update_message(str(progress) + "/" + str(data_size))
            progress += 1

        return np.array(word_rows), np.array(head_rows), np.array(tag_rows)