import numpy as np


def __create_xy_train(self, tag_file, embedding_file, data_size=1, look_back=5, threshold=0, suffix=None):
    x_train = []
    y_train = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words = DataUtils.extract_word_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

    # Trim to a multiple of look_back so every window is complete.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back

    for idx in np.arange(0, data_size, look_back):
        dict_tag_inputs = [tag_dict[words[idx]]]
        word_inputs = [word_emb[word_keys[idx]]] if word_keys[idx] in word_emb else [word_emb["UNK"]]

        for widx in range(1, look_back):
            word_inputs = np.append(
                word_inputs,
                [word_emb[word_keys[idx + widx]]] if word_keys[idx + widx] in word_emb else [word_emb["UNK"]],
                axis=0)
            dict_tag_inputs.append(tag_dict[words[idx + widx]])

        # Every combination of candidate tags across the window becomes one sample.
        dict_tag_inputs = DataUtils.cartesian(np.array(dict_tag_inputs))

        for jdx in range(len(dict_tag_inputs)):
            tag_inputs = [tag_emb[tag] for tag in dict_tag_inputs[jdx]]

            if idx == 0 and jdx == 0:
                x_train = [word_inputs]
                y_train = [tag_inputs]
            else:
                x_train = np.append(x_train, [word_inputs], axis=0)
                y_train = np.append(y_train, [tag_inputs], axis=0)

        if idx % int(data_size / (10 * look_back)) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    return x_train, y_train
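# A minimal sketch of the tag-expansion step above, assuming DataUtils.cartesian
# behaves like an n-fold cartesian product: each word in the window may have
# several candidate tags, and every combination becomes one training sample.
# The tag lists here are illustrative, not from the corpus.
from itertools import product


def expand_tag_combinations(candidate_tags_per_word):
    """Return every tag assignment for the window, one tuple per combination."""
    return list(product(*candidate_tags_per_word))


window_candidates = [["NOUN"], ["VERB", "AUX"], ["DET"]]
for combination in expand_tag_combinations(window_candidates):
    print(combination)
# ('NOUN', 'VERB', 'DET')
# ('NOUN', 'AUX', 'DET')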
def __create_xy_test(self, tag_file, embedding_file, data_size=1, look_back=5, suffix=None):
    x_test = []
    y_test = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Trim to a multiple of look_back so every timestep block is complete.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back

    for idx in np.arange(0, data_size, look_back):
        x_timestep = []
        y_timestep = []

        for jdx in range(look_back):
            word_input = word_emb[word_keys[idx + jdx]] if word_keys[idx + jdx] in word_emb else word_emb["UNK"]
            tag_input = tag_emb[tags[idx + jdx]]

            if jdx == 0:
                x_timestep = [word_input]
                y_timestep = [tag_input]
            else:
                x_timestep = np.append(x_timestep, [word_input], axis=0)
                y_timestep = np.append(y_timestep, [tag_input], axis=0)

        x_timestep = np.array(x_timestep)
        y_timestep = np.array(y_timestep)

        if idx == 0:
            x_test = [x_timestep]
            y_test = [y_timestep]
        else:
            x_test = np.append(x_test, [x_timestep], axis=0)
            y_test = np.append(y_test, [y_timestep], axis=0)

        if idx % int(data_size / (10 * look_back)) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_test = np.array(x_test)
    y_test = np.array(y_test)

    return x_test, y_test
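# Sketch of the fixed-length batching used above, with toy embeddings instead of
# the real word_emb dictionary: the token stream is cut into look_back-sized
# blocks, each becoming one (look_back, dim) timestep matrix.
import numpy as np

look_back, dim = 5, 4
toy_emb = {"UNK": np.zeros(dim)}  # every token falls back to UNK here
tokens = ["the", "cat", "sat", "on", "the", "mat", "and", "slept", "all", "day"]

usable = len(tokens) - len(tokens) % look_back  # trim to a multiple of look_back
x = np.stack([
    np.stack([toy_emb.get(tok, toy_emb["UNK"]) for tok in tokens[i:i + look_back]])
    for i in range(0, usable, look_back)
])
print(x.shape)  # (2, 5, 4) -> two timestep blocks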
def __create_xy(self, tag_file, embedding_file, data_size, window_size, available_tags, suffix):
    x = []
    y = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    data_size = int(len(words) * data_size)

    for idx in range(data_size):
        # The label belongs to the word at the center of the window.
        tag = tags[idx + int(window_size / 2)]

        if len(available_tags) == 0 or tag in available_tags:
            word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]

            for widx in range(1, window_size):
                word_input = np.append(
                    word_input,
                    word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"],
                    axis=0)

            tag_input = tag_emb[tag]

            # Test len(x) rather than idx == 0: when available_tags filters out
            # the first words, idx 0 never reaches this branch and np.append
            # would fail on the still-empty lists.
            if len(x) == 0:
                x = [word_input]
                y = [tag_input]
            else:
                x = np.append(x, [word_input], axis=0)
                y = np.append(y, [tag_input], axis=0)

        if idx % int(data_size / 10) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    return np.array(x), np.array(y)
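# Sketch of the window construction above: the window_size embeddings starting
# at a position are concatenated into one flat input vector, and the label is
# the tag of the center word. Vectors are random stand-ins for real embeddings.
import numpy as np

rng = np.random.default_rng(0)
dim, window_size = 4, 3
words = ["dogs", "chase", "cats", "every", "day"]
emb = {w: rng.normal(size=dim) for w in words}

idx = 1  # window covers words[1:4]
word_input = np.concatenate([emb[words[idx + w]] for w in range(window_size)])
center = words[idx + window_size // 2]
print(word_input.shape, center)  # (12,) cats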
def __create_xy_train(self, tag_file, embedding_file, data_size, window_size, threshold, suffix):
    x_train = []
    y_train = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words = DataUtils.extract_word_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

    # Cap data_size so the window never runs past the end of the corpus.
    data_size = min(int(len(words) * data_size), len(words) - window_size)

    for idx in range(data_size):
        word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]

        for widx in range(1, window_size):
            word_input = np.append(
                word_input,
                word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"],
                axis=0)

        # One training pair per candidate tag of the center word.
        tag_inputs = [tag_emb[tag] for tag in tag_dict[words[idx + int(window_size / 2)]]]

        for tidx in range(len(tag_inputs)):
            tag_input = tag_inputs[tidx]

            if idx == 0 and tidx == 0:
                x_train = [word_input]
                y_train = [tag_input]
            else:
                x_train = np.append(x_train, [word_input], axis=0)
                y_train = np.append(y_train, [tag_input], axis=0)

        if idx % int(data_size / 100) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    return x_train, y_train
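# The np.append-in-a-loop pattern used by these builders copies the whole array
# on every call, which is quadratic in the number of samples. A sketch of the
# equivalent list-accumulation idiom they could use instead:
import numpy as np

samples = [np.full(3, i, dtype="float32") for i in range(4)]  # stand-in inputs

x_train = []
for sample in samples:
    x_train.append(sample)   # O(1) per step
x_train = np.array(x_train)  # one copy at the end
print(x_train.shape)         # (4, 3)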
def __create_xy(self, dependency_tree, embedding_file, data_size, look_back, test=False):
    sentences, words, tags = DataUtils.parse_dependency_tree(dependency_tree)
    word_emb = DataUtils.load_embeddings(embedding_file)
    tag_int = DataUtils.create_int_dict(tags)

    data_size = int(len(sentences) * min(data_size, 1))

    if test:
        sentences.reverse()

    # look_back == 0 means "fit the longest sentence".
    if look_back == 0:
        for sentence in sentences[:data_size]:
            look_back = max(look_back, len(sentence))

    self.look_back = look_back
    self.distinct_words = len(words)
    self.distinct_tags = len(tags)

    word_input_forward = []
    word_input_backward = []
    word_head_forward = []
    word_head_backward = []
    tag_input_forward = []
    tag_input_backward = []
    tag_head_forward = []
    tag_head_backward = []
    probability = []

    progress = 0

    for sentence in sentences[:data_size]:
        # Split long sentences into look_back-sized parts.
        parts = [sentence[i:i + look_back] for i in range(0, len(sentence), look_back)]

        for part in parts:
            word_temp = np.zeros((2, look_back, 300))
            tag_temp = np.zeros((2, look_back), dtype="int32")

            for idx in range(len(part)):
                word = part[idx]["word"]
                # Row 0: reading order, right-aligned; row 1: reversed.
                word_temp[0][look_back - len(part) + idx] = word_emb[word] if word in word_emb else word_emb["UNK"]
                word_temp[1][look_back - idx - 1] = word_emb[word] if word in word_emb else word_emb["UNK"]

                tag = part[idx]["tag"]
                tag_temp[0][look_back - len(part) + idx] = tag_int[tag]
                tag_temp[1][look_back - idx - 1] = tag_int[tag]

            # Per-token views: forward prefix and backward suffix for each token.
            word_instance = np.zeros((len(part), 2, look_back, 300))
            tag_instance = np.zeros((len(part), 2, look_back), dtype="int32")

            for idx in range(len(part)):
                word_instance[idx][0][look_back - idx - 1:] = word_temp[0][look_back - len(part):look_back - len(part) + idx + 1]
                word_instance[idx][1][look_back - len(part) + idx:] = word_temp[1][look_back - len(part):look_back - idx]
                tag_instance[idx][0][look_back - idx - 1:] = tag_temp[0][look_back - len(part):look_back - len(part) + idx + 1]
                tag_instance[idx][1][look_back - len(part) + idx:] = tag_temp[1][look_back - len(part):look_back - idx]

            # One sample per ordered (dependent idx, candidate head jdx) pair.
            for idx in range(len(part)):
                word_input = np.zeros((2, 2, look_back, 300))
                tag_input = np.zeros((2, 2, look_back), dtype="int32")

                for jdx in range(len(part)):
                    if idx != jdx:
                        # Label per pair: 1.0 only when jdx is idx's head.
                        prob_temp = 1.0 if part[idx]["head"] == part[jdx]["word"] else 0.0

                        word_input[0] = word_instance[idx]
                        tag_input[0] = tag_instance[idx]
                        word_input[1] = word_instance[jdx]
                        tag_input[1] = tag_instance[jdx]

                        if len(word_input_forward) == 0:
                            # Copies, not references: the word_input/tag_input
                            # buffers are overwritten on the next jdx iteration.
                            word_input_forward = [word_input[0][0].copy()]
                            word_input_backward = [word_input[0][1].copy()]
                            word_head_forward = [word_input[1][0].copy()]
                            word_head_backward = [word_input[1][1].copy()]
                            tag_input_forward = [tag_input[0][0].copy()]
                            tag_input_backward = [tag_input[0][1].copy()]
                            tag_head_forward = [tag_input[1][0].copy()]
                            tag_head_backward = [tag_input[1][1].copy()]
                            probability = [prob_temp]
                        else:
                            word_input_forward = np.append(word_input_forward, [word_input[0][0]], axis=0)
                            word_input_backward = np.append(word_input_backward, [word_input[0][1]], axis=0)
                            word_head_forward = np.append(word_head_forward, [word_input[1][0]], axis=0)
                            word_head_backward = np.append(word_head_backward, [word_input[1][1]], axis=0)
                            tag_input_forward = np.append(tag_input_forward, [tag_input[0][0]], axis=0)
                            tag_input_backward = np.append(tag_input_backward, [tag_input[0][1]], axis=0)
                            tag_head_forward = np.append(tag_head_forward, [tag_input[1][0]], axis=0)
                            tag_head_backward = np.append(tag_head_backward, [tag_input[1][1]], axis=0)
                            probability = np.append(probability, [prob_temp], axis=0)

        DataUtils.update_message(str(progress) + "/" + str(data_size))
        progress += 1

    word_data = [(word_input_forward, word_input_backward),
                 (word_head_forward, word_head_backward)]
    tag_data = [(tag_input_forward, tag_input_backward),
                (tag_head_forward, tag_head_backward)]

    return word_data, tag_data, probability
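# Sketch of the padding layout used by word_temp above, with token ids instead
# of 300-dim embeddings: row 0 holds the part in reading order, right-aligned
# into the look_back frame; row 1 holds it reversed.
import numpy as np

look_back = 6
part = [11, 12, 13, 14]  # stand-in token ids
temp = np.zeros((2, look_back), dtype="int32")

for idx, tok in enumerate(part):
    temp[0][look_back - len(part) + idx] = tok  # forward, right-aligned
    temp[1][look_back - idx - 1] = tok          # backward, right-aligned

print(temp[0])  # [ 0  0 11 12 13 14]
print(temp[1])  # [ 0  0 14 13 12 11]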
def __create_xy(self, embedding_file, data_size, look_back, test=False):
    sentences, words, tags = DataUtils.parse_dependency_tree(self.language)

    # Turkish uses fasttext embeddings; other languages use the default format.
    if self.language == "turkish":
        word_emb = DataUtils.load_embeddings(embedding_file, "fasttext")
    else:
        word_emb = DataUtils.load_embeddings(embedding_file)

    tag_int = DataUtils.create_int_dict(tags)

    data_size = int(len(sentences) * min(data_size, 1))

    if test:
        sentences.reverse()

    # look_back == 0 means "fit the longest sentence".
    if look_back == 0:
        for sentence in sentences[:data_size]:
            look_back = max(look_back, len(sentence))

    self.look_back = look_back
    self.distinct_words = len(words)
    self.distinct_tags = len(tags)

    word_full_forward = []
    word_full_backward = []
    word_instance_forward = []
    word_instance_backward = []
    tag_full_forward = []
    tag_full_backward = []
    tag_instance_forward = []
    tag_instance_backward = []
    head = []

    progress = 0

    for sentence in sentences[:data_size]:
        parts = [sentence[i:i + look_back] for i in range(0, len(sentence), look_back)]

        for part in parts:
            word_temp = np.zeros((2, look_back, 300))
            tag_temp = np.zeros((2, look_back), dtype="int32")

            for idx in range(len(part)):
                word = part[idx]["word"]
                # Row 0: reading order, right-aligned; row 1: reversed.
                word_temp[0][look_back - len(part) + idx] = word_emb[word] if word in word_emb else word_emb["UNK"]
                word_temp[1][look_back - idx - 1] = word_emb[word] if word in word_emb else word_emb["UNK"]

                tag = part[idx]["tag"]
                tag_temp[0][look_back - len(part) + idx] = tag_int[tag]
                tag_temp[1][look_back - idx - 1] = tag_int[tag]

            # One sample per token jdx of the part.
            for jdx in range(len(part)):
                # Allocate fresh per token: these views are appended below, and
                # reusing one buffer would leak data between iterations.
                word_instance = np.zeros((2, look_back, 300))
                tag_instance = np.zeros((2, look_back), dtype="int32")

                word_instance[0][look_back - jdx - 1:] = word_temp[0][look_back - len(part):look_back - len(part) + jdx + 1]
                word_instance[1][look_back - len(part) + jdx:] = word_temp[1][look_back - len(part):look_back - jdx]
                tag_instance[0][look_back - jdx - 1:] = tag_temp[0][look_back - len(part):look_back - len(part) + jdx + 1]
                tag_instance[1][look_back - len(part) + jdx:] = tag_temp[1][look_back - len(part):look_back - jdx]

                # One-hot indicator of token jdx's head within the part.
                head_instance = np.zeros((look_back, 1), dtype="float32")
                for zdx in range(len(part)):
                    head_instance[zdx] = 1 if part[jdx]["head"] == part[zdx]["word"] else 0

                if len(word_full_forward) == 0:
                    word_full_forward = [word_temp[0]]
                    word_full_backward = [word_temp[1]]
                    word_instance_forward = [word_instance[0]]
                    word_instance_backward = [word_instance[1]]
                    tag_full_forward = [tag_temp[0]]
                    tag_full_backward = [tag_temp[1]]
                    tag_instance_forward = [tag_instance[0]]
                    tag_instance_backward = [tag_instance[1]]
                    head = [head_instance]
                else:
                    word_full_forward = np.append(word_full_forward, [word_temp[0]], axis=0)
                    word_full_backward = np.append(word_full_backward, [word_temp[1]], axis=0)
                    word_instance_forward = np.append(word_instance_forward, [word_instance[0]], axis=0)
                    word_instance_backward = np.append(word_instance_backward, [word_instance[1]], axis=0)
                    tag_full_forward = np.append(tag_full_forward, [tag_temp[0]], axis=0)
                    tag_full_backward = np.append(tag_full_backward, [tag_temp[1]], axis=0)
                    tag_instance_forward = np.append(tag_instance_forward, [tag_instance[0]], axis=0)
                    tag_instance_backward = np.append(tag_instance_backward, [tag_instance[1]], axis=0)
                    head = np.append(head, [head_instance], axis=0)

        DataUtils.update_message(str(progress) + "/" + str(data_size))
        progress += 1

    word_data = [(word_full_forward, word_full_backward),
                 (word_instance_forward, word_instance_backward)]
    tag_data = [(tag_full_forward, tag_full_backward),
                (tag_instance_forward, tag_instance_backward)]

    return word_data, tag_data, head
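# Sketch of the head_instance target built above: for token jdx, a column
# vector with 1.0 at each position whose word is jdx's head. Toy sentence,
# same {"word", "head"} dict shape as the parsed dependency tree.
import numpy as np

part = [
    {"word": "ROOT", "head": None},
    {"word": "dogs", "head": "bark"},
    {"word": "bark", "head": "ROOT"},
]
look_back = len(part)

jdx = 1  # target token: "dogs"
head_instance = np.zeros((look_back, 1), dtype="float32")
for zdx in range(len(part)):
    head_instance[zdx] = 1.0 if part[jdx]["head"] == part[zdx]["word"] else 0.0

print(head_instance.ravel())  # [0. 0. 1.] -> "bark" is the head of "dogs"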
def __create_xy(self, parse_tree_file, data_size, seq_len, test=False):
    # NOTE: data_size is currently unused; the split below is a fixed
    # 5000-sample slice.
    sentences, words, tags = DataUtils.parse_dependency_tree(parse_tree_file)
    word_int = DataUtils.create_int_dict(words)
    tag_int = DataUtils.create_onehot_vectors(tags)

    self.seq_len = seq_len
    self.distinct_words = len(words)
    self.distinct_tags = len(tags)

    # Upper bound: seq_len * seq_len (dependent, head) pairs per part. Partial
    # parts fill fewer rows; the unused zero rows are cut by the split below.
    data_len = 0
    for i in range(len(sentences)):
        data_len += int(np.ceil(len(sentences[i]) / seq_len)) * seq_len * seq_len

    forward = np.zeros((2, data_len, seq_len), dtype="int32")
    backward = np.zeros((2, data_len, seq_len), dtype="int32")
    probability = np.zeros((data_len,), dtype="float32")
    tag_targets = np.zeros((data_len, 18))  # 18 = one-hot tag dimension of this corpus

    idx = 0

    for sentence in sentences:
        parts = [sentence[i:i + seq_len] for i in range(0, len(sentence), seq_len)]

        for part in parts:
            part_len = len(part)
            word_forward = np.zeros((seq_len, seq_len), dtype="int32")
            word_backward = np.zeros((seq_len, seq_len), dtype="int32")

            for jdx in range(part_len):
                # Right-aligned prefix of word ids up to token jdx...
                word_forward[jdx][seq_len - jdx - 1:] = [word_int[part[i]["word"]] for i in range(jdx + 1)]
                # ...and the reversed suffix from token jdx onward.
                word_backward[jdx][seq_len - part_len + jdx:] = [word_int[part[part_len - i - 1]["word"]] for i in range(part_len - jdx)]

            for jdx in range(part_len):
                for zdx in range(part_len):
                    tag_targets[idx] = tag_int[part[jdx]["tag"]]
                    forward[0][idx] = word_forward[jdx]
                    forward[1][idx] = word_forward[zdx]
                    backward[0][idx] = word_backward[jdx]
                    backward[1][idx] = word_backward[zdx]
                    probability[idx] = 1.0 if part[jdx]["head"] == part[zdx]["word"] else 0.0
                    idx += 1

                    if idx % int(data_len / 100) == 0:
                        DataUtils.update_message(str(int(idx / data_len * 100)))

    # Fixed split: first 5000 pairs for training, the next 5000 for testing.
    if test:
        forward = [np.array(forward[0][5000:10000]), np.array(forward[1][5000:10000])]
        backward = [np.array(backward[0][5000:10000]), np.array(backward[1][5000:10000])]
        probability = np.array(probability[5000:10000])
        tag_targets = np.array(tag_targets[5000:10000])
    else:
        forward = [np.array(forward[0][:5000]), np.array(forward[1][:5000])]
        backward = [np.array(backward[0][:5000]), np.array(backward[1][:5000])]
        probability = np.array(probability[:5000])
        tag_targets = np.array(tag_targets[:5000])

    return [forward[0], backward[0], forward[1], backward[1]], [tag_targets, probability]
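# Sketch of the per-token views built above, with integer ids directly: row jdx
# of word_forward holds the right-aligned prefix up to token jdx, and row jdx
# of word_backward holds the reversed suffix from token jdx onward.
import numpy as np

seq_len = 4
ids = [7, 8, 9]  # stand-in word ids for a 3-token part
part_len = len(ids)

word_forward = np.zeros((seq_len, seq_len), dtype="int32")
word_backward = np.zeros((seq_len, seq_len), dtype="int32")
for jdx in range(part_len):
    word_forward[jdx][seq_len - jdx - 1:] = ids[:jdx + 1]
    word_backward[jdx][seq_len - part_len + jdx:] = ids[::-1][:part_len - jdx]

print(word_forward[:part_len])
# [[0 0 0 7]
#  [0 0 7 8]
#  [0 7 8 9]]
print(word_backward[:part_len])
# [[0 9 8 7]
#  [0 0 9 8]
#  [0 0 0 9]]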
def __create_xy(self, dependency_tree, embedding_file, data_size, look_back, test=False):
    sentences, words, tags = DataUtils.parse_dependency_tree(dependency_tree)
    word_vectors = DataUtils.create_onehot_vectors(words)
    word_emb = DataUtils.load_embeddings(embedding_file)
    tag_int = DataUtils.create_int_dict(tags)

    data_size = int(len(sentences) * min(data_size, 1))

    if test:
        sentences.reverse()

    # look_back == 0 means "fit the longest sentence".
    if look_back == 0:
        for sentence in sentences[:data_size]:
            look_back = max(look_back, len(sentence))

    self.look_back = look_back
    self.distinct_words = len(words)
    self.distinct_tags = len(tags)

    word_data = []
    head_data = []
    tag_data = []

    progress = 0

    for sentence in sentences[:data_size]:
        word_timestep = np.zeros((look_back, 300))
        head_timestep = np.zeros((look_back, len(words)))
        tag_timestep = np.zeros((look_back,), dtype="int32")

        timestep = 0

        for element in sentence:
            word = element["word"]
            # ROOT has no embedding; its word slot stays zero.
            if word != "ROOT":
                word_timestep[timestep % look_back] = word_emb[word] if word in word_emb else word_emb["UNK"]

            head = element["head"]
            head_timestep[timestep % look_back] = word_vectors[head]

            tag = element["tag"]
            tag_timestep[timestep % look_back] = tag_int[tag]

            timestep += 1

            if timestep % look_back == 0 or timestep == len(sentence):
                # Store copies: the timestep buffers are reused (fill(0)) below,
                # and [word_timestep] would otherwise keep a mutable reference
                # that the reset would wipe.
                if len(word_data) == 0:
                    word_data = [word_timestep.copy()]
                    head_data = [head_timestep.copy()]
                    tag_data = [tag_timestep.copy()]
                else:
                    word_data = np.append(word_data, [word_timestep], axis=0)
                    head_data = np.append(head_data, [head_timestep], axis=0)
                    tag_data = np.append(tag_data, [tag_timestep], axis=0)

                word_timestep.fill(0)
                head_timestep.fill(0)
                tag_timestep.fill(0)

        DataUtils.update_message(str(progress) + "/" + str(data_size))
        progress += 1

    word_data = np.array(word_data)
    head_data = np.array(head_data)
    tag_data = np.array(tag_data)

    return word_data, head_data, tag_data
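# Sketch of the aliasing pitfall fixed above: storing a reused buffer in a list
# keeps a reference, so a later fill(0) wipes the stored "sample" too. Copying
# on insert decouples the stored data from the scratch buffer.
import numpy as np

buffer = np.ones(3)
stored_by_reference = [buffer]
stored_by_copy = [buffer.copy()]
buffer.fill(0)

print(stored_by_reference[0])  # [0. 0. 0.] -> clobbered along with the buffer
print(stored_by_copy[0])       # [1. 1. 1.] -> safe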