def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=0, normalize_digits=False):
    """Build the word/character/POS/type alphabets, or load them if already saved.

    When ``alphabet_directory`` is not an existing directory, the alphabets are
    built from the sentences parsed out of ``train_path``, optionally extended
    with words from ``data_paths`` that appear in ``embedd_dict``, and then saved
    to ``alphabet_directory``. Otherwise all four alphabets are loaded from it.
    In both cases the alphabets are closed before being returned.

    Args:
        alphabet_directory: Directory the alphabets are saved to / loaded from.
        train_path: Path of the training file handed to ``parse``.
        data_paths: Optional iterable of extra files used only to expand the
            vocabulary (and the char/POS/type alphabets).
        max_vocabulary_size: The word list is truncated to this many entries
            before the optional expansion step.
        embedd_dict: Optional pretrained embedding dictionary; must be an
            ``OrderedDict`` when given.
        min_occurrence: Words whose (possibly boosted) count is not greater than
            this are dropped; words with raw count <= this are the singletons.
        normalize_digits: If true, ``DIGIT_RE`` matches in a form are replaced
            by ``"0"`` before counting.

    Returns:
        Tuple ``(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)``.
    """

    def read_sentences(path):
        # Read through a context manager so the file handle is always closed;
        # the previous ``open(path).read()`` left it to the garbage collector.
        with open(path, 'r') as handle:
            return parse(handle.read())

    def register_token(token):
        """Record a token's characters, POS and relation; return its word form."""
        form = token['form']
        pos = token['upostag']
        deprel = token['deprel']
        # Characters are taken from the part of the form before any '_BERT_' marker.
        for char in form.split('_BERT_')[0]:
            char_alphabet.add(char)
        pos_alphabet.add(pos)
        type_alphabet.add(deprel)
        return DIGIT_RE.sub("0", form) if normalize_digits else form

    def expand_vocab():
        """Append words from ``data_paths`` that the embedding dictionary covers."""
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            for sentence in read_sentences(data_path):
                for token in sentence:
                    form = register_token(token)
                    if form not in vocab_set and (form in embedd_dict or form.lower() in embedd_dict):
                        vocab_set.add(form)
                        vocab_list.append(form)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=False)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        # Special symbols are registered first, before any symbol from the data.
        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        char_alphabet.add(ROOT_CHAR)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        char_alphabet.add(END_CHAR)
        pos_alphabet.add(END_POS)
        type_alphabet.add(END_TYPE)

        vocab = defaultdict(int)
        for sentence in read_sentences(train_path):
            for token in sentence:
                vocab[register_token(token)] += 1

        # Singletons are decided on the raw counts, before the embedding boost below.
        singletons = {word for word, count in vocab.items() if count <= min_occurrence}

        # A word covered by the pretrained embeddings gets its count raised by
        # min_occurrence so that it survives the rare-word filter.
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab:
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))
        vocab_list = [
            word for word in vocab_list
            if word in _START_VOCAB or vocab[word] > min_occurrence
        ]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet,
                       type_alphabet: Alphabet, max_size=None, normalize_digits=True, symbolic_root=False,
                       symbolic_end=False):
    """Read instances with ``CoNLLXReader`` and pack them into per-bucket tensors.

    Every instance goes into the first bucket of ``_buckets`` whose size is
    strictly greater than the instance length; an instance that fits no bucket
    is not stored (it is still counted in the printed total).

    Args:
        source_path: File handed to ``CoNLLXReader``.
        word_alphabet, char_alphabet, pos_alphabet, type_alphabet: Alphabets
            handed to the reader; ``word_alphabet`` also supplies
            ``is_singleton``.
        max_size: If truthy, stop after this many instances have been read.
        normalize_digits, symbolic_root, symbolic_end: Forwarded to
            ``reader.getNext``.

    Returns:
        ``(data_tensors, bucket_sizes)``. ``data_tensors`` holds one entry per
        bucket: a dict with keys WORD, CHAR, POS, HEAD, TYPE, MASK, SINGLE and
        LENGTH, or the placeholder ``(1, 1)`` for an empty bucket.
        ``bucket_sizes`` is the number of stored instances per bucket.
    """

    def pad_into(matrix, row, values, used, pad_id):
        # Write ``values`` into the first ``used`` cells of a row, pad the rest.
        matrix[row, :used] = values
        matrix[row, used:] = pad_id

    bucket_count = len(_buckets)
    bucketed = [[] for _ in _buckets]
    longest_word = [0 for _ in _buckets]
    print('Reading data from %s' % source_path)
    counter = 0
    reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    reader_options = dict(normalize_digits=normalize_digits, symbolic_root=symbolic_root,
                          symbolic_end=symbolic_end)

    inst = reader.getNext(**reader_options)
    while inst is not None:
        if max_size and counter >= max_size:
            break
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        inst_size = inst.length()
        sent = inst.sentence
        # First bucket strictly larger than the instance, if there is one.
        target = next((b for b, capacity in enumerate(_buckets) if inst_size < capacity), None)
        if target is not None:
            bucketed[target].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids])
            word_len = max([len(char_seq) for char_seq in sent.char_seqs])
            if longest_word[target] < word_len:
                longest_word[target] = word_len

        inst = reader.getNext(**reader_options)
    reader.close()
    print("Total number of data: %d" % counter)

    bucket_sizes = [len(rows) for rows in bucketed]
    data_tensors = []
    for bucket_id in range(bucket_count):
        rows = bucketed[bucket_id]
        bucket_size = bucket_sizes[bucket_id]
        if bucket_size == 0:
            # Placeholder for a bucket that received no instance.
            data_tensors.append((1, 1))
            continue

        bucket_length = _buckets[bucket_id]
        char_length = min(MAX_CHAR_LENGTH, longest_word[bucket_id])
        flat_shape = [bucket_size, bucket_length]
        wid_inputs = np.empty(flat_shape, dtype=np.int64)
        cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
        pid_inputs = np.empty(flat_shape, dtype=np.int64)
        hid_inputs = np.empty(flat_shape, dtype=np.int64)
        tid_inputs = np.empty(flat_shape, dtype=np.int64)

        masks = np.zeros(flat_shape, dtype=np.float32)
        single = np.zeros(flat_shape, dtype=np.int64)
        lengths = np.empty(bucket_size, dtype=np.int64)

        for i, (wids, cid_seqs, pids, hids, tids) in enumerate(rows):
            inst_size = len(wids)
            lengths[i] = inst_size
            # word ids
            pad_into(wid_inputs, i, wids, inst_size, PAD_ID_WORD)
            # character ids: pad every word, then every position past the sentence
            char_rows = cid_inputs[i]
            for c, cids in enumerate(cid_seqs):
                pad_into(char_rows, c, cids, len(cids), PAD_ID_CHAR)
            cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
            # pos ids
            pad_into(pid_inputs, i, pids, inst_size, PAD_ID_TAG)
            # type ids
            pad_into(tid_inputs, i, tids, inst_size, PAD_ID_TAG)
            # heads
            pad_into(hid_inputs, i, hids, inst_size, PAD_ID_TAG)
            # masks
            masks[i, :inst_size] = 1.0
            # singleton flags
            for j, wid in enumerate(wids):
                if word_alphabet.is_singleton(wid):
                    single[i, j] = 1

        data_tensors.append({
            'WORD': torch.from_numpy(wid_inputs),
            'CHAR': torch.from_numpy(cid_inputs),
            'POS': torch.from_numpy(pid_inputs),
            'HEAD': torch.from_numpy(hid_inputs),
            'TYPE': torch.from_numpy(tid_inputs),
            'MASK': torch.from_numpy(masks),
            'SINGLE': torch.from_numpy(single),
            'LENGTH': torch.from_numpy(lengths),
        })
    return data_tensors, bucket_sizes
def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet,
              chunk_alphabet: Alphabet, ner_alphabet: Alphabet, max_size=None, normalize_digits=True):
    """Read instances with ``CoNLL03Reader`` and pack them into padded tensors.

    Args:
        source_path: File handed to ``CoNLL03Reader``.
        word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet,
        ner_alphabet: Alphabets handed to the reader; ``word_alphabet`` also
            supplies ``is_singleton``.
        max_size: If truthy, stop after this many instances have been read.
        normalize_digits: Forwarded to ``reader.getNext``.

    Returns:
        ``(data_tensor, data_size)`` where ``data_tensor`` is a dict with keys
        WORD, CHAR, POS, CHUNK, NER, MASK, SINGLE and LENGTH, and ``data_size``
        is the number of instances read.
    """

    def pad_into(matrix, row, values, used, pad_id):
        # Write ``values`` into the first ``used`` cells of a row, pad the rest.
        matrix[row, :used] = values
        matrix[row, used:] = pad_id

    rows = []
    longest_sentence = 0
    longest_word = 0
    print('Reading data from %s' % source_path)
    counter = 0
    reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    inst = reader.getNext(normalize_digits)
    while inst is not None:
        if max_size and counter >= max_size:
            break
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        sent = inst.sentence
        rows.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids])
        word_len = max([len(char_seq) for char_seq in sent.char_seqs])
        if longest_word < word_len:
            longest_word = word_len
        if longest_sentence < inst.length():
            longest_sentence = inst.length()
        inst = reader.getNext(normalize_digits)
    reader.close()
    print("Total number of data: %d" % counter)

    data_size = len(rows)
    char_length = min(MAX_CHAR_LENGTH, longest_word)
    flat_shape = [data_size, longest_sentence]
    wid_inputs = np.empty(flat_shape, dtype=np.int64)
    cid_inputs = np.empty([data_size, longest_sentence, char_length], dtype=np.int64)
    pid_inputs = np.empty(flat_shape, dtype=np.int64)
    chid_inputs = np.empty(flat_shape, dtype=np.int64)
    nid_inputs = np.empty(flat_shape, dtype=np.int64)

    masks = np.zeros(flat_shape, dtype=np.float32)
    single = np.zeros(flat_shape, dtype=np.int64)
    lengths = np.empty(data_size, dtype=np.int64)

    for i, (wids, cid_seqs, pids, chids, nids) in enumerate(rows):
        inst_size = len(wids)
        lengths[i] = inst_size
        # word ids
        pad_into(wid_inputs, i, wids, inst_size, PAD_ID_WORD)
        # character ids: pad every word, then every position past the sentence
        char_rows = cid_inputs[i]
        for c, cids in enumerate(cid_seqs):
            pad_into(char_rows, c, cids, len(cids), PAD_ID_CHAR)
        cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
        # pos ids
        pad_into(pid_inputs, i, pids, inst_size, PAD_ID_TAG)
        # chunk ids
        pad_into(chid_inputs, i, chids, inst_size, PAD_ID_TAG)
        # ner ids
        pad_into(nid_inputs, i, nids, inst_size, PAD_ID_TAG)
        # masks
        masks[i, :inst_size] = 1.0
        # singleton flags
        for j, wid in enumerate(wids):
            if word_alphabet.is_singleton(wid):
                single[i, j] = 1

    data_tensor = {
        'WORD': torch.from_numpy(wid_inputs),
        'CHAR': torch.from_numpy(cid_inputs),
        'POS': torch.from_numpy(pid_inputs),
        'CHUNK': torch.from_numpy(chid_inputs),
        'NER': torch.from_numpy(nid_inputs),
        'MASK': torch.from_numpy(masks),
        'SINGLE': torch.from_numpy(single),
        'LENGTH': torch.from_numpy(lengths),
    }
    return data_tensor, data_size
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):
    """Build the word/character/POS/type alphabets, or load them if already saved.

    When ``alphabet_directory`` is an existing directory the four alphabets are
    loaded from it. Otherwise they are built from the tab-separated token lines
    of ``train_path`` (column 1 = word, column 4 = POS, column 7 = relation),
    optionally extended with words from ``data_paths`` that are covered by
    ``embedd_dict``, and saved to ``alphabet_directory``. In both cases the
    alphabets are closed before being returned.

    Args:
        alphabet_directory: Directory the alphabets are saved to / loaded from.
        train_path: Training file used for counting words.
        data_paths: Optional iterable of extra files used for expansion only.
        max_vocabulary_size: The word list is truncated to this many entries
            before the optional expansion step.
        embedd_dict: Optional pretrained embedding dictionary; asserted to be an
            ``OrderedDict`` when given.
        min_occurrence: Words whose (possibly boosted) count is not greater than
            this are dropped; words with raw count <= this are the singletons.
        normalize_digits: If true, ``DIGIT_RE`` matches in a word are replaced
            by ``"0"``.

    Returns:
        Tuple ``(word_alphabet, char_alphabet, pos_alphabet, type_alphabet)``.
    """

    def token_rows(path):
        """Yield the tab-split columns of each usable token line in ``path``."""
        with open(path, 'r') as stream:
            for raw in stream:
                stripped = raw.strip()
                # Blank lines and '#' comment lines (conllu format) hold no token.
                if not stripped or stripped.startswith('#'):
                    continue
                columns = stripped.split('\t')
                token_id = columns[0]
                # conllu: ids containing '-' or '.' are skipped.
                if '-' in token_id or '.' in token_id:
                    continue
                yield columns

    def expand_vocab():
        """Append words from ``data_paths`` that the embedding dictionary covers."""
        known = set(vocab_list)
        for extra_path in data_paths:
            for columns in token_rows(extra_path):
                surface = columns[1]
                for char in surface:
                    char_alphabet.add(char)
                word = DIGIT_RE.sub("0", surface) if normalize_digits else surface
                pos_tag = columns[4]
                dep_label = columns[7]
                pos_alphabet.add(pos_tag)
                type_alphabet.add(dep_label)
                if word in known:
                    continue
                if word in embedd_dict or word.lower() in embedd_dict:
                    known.add(word)
                    vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    all_alphabets = (word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    if os.path.isdir(alphabet_directory):
        for alphabet in all_alphabets:
            alphabet.load(alphabet_directory)
    else:
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        # Special symbols are registered first, before any symbol from the data.
        for special_char, special_pos, special_type in (
                (PAD_CHAR, PAD_POS, PAD_TYPE),
                (ROOT_CHAR, ROOT_POS, ROOT_TYPE),
                (END_CHAR, END_POS, END_TYPE)):
            char_alphabet.add(special_char)
            pos_alphabet.add(special_pos)
            type_alphabet.add(special_type)

        vocab = defaultdict(int)
        for columns in token_rows(train_path):
            surface = columns[1]
            for char in surface:
                char_alphabet.add(char)
            word = DIGIT_RE.sub("0", surface) if normalize_digits else surface
            vocab[word] += 1
            pos_alphabet.add(columns[4])
            type_alphabet.add(columns[7])

        # Singletons are decided on the raw counts, before the embedding boost below.
        singletons = {word for word, count in vocab.items() if count <= min_occurrence}

        # A word covered by the pretrained embeddings has its count raised by
        # min_occurrence so that it passes the rare-word filter.
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab:
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        by_frequency = sorted(vocab, key=vocab.get, reverse=True)
        vocab_list = _START_VOCAB + by_frequency
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))

        kept = []
        for word in vocab_list:
            if word in _START_VOCAB or vocab[word] > min_occurrence:
                kept.append(word)
        vocab_list = kept
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        for alphabet in all_alphabets:
            alphabet.save(alphabet_directory)

    for alphabet in all_alphabets:
        alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet,
              type_alphabet: Alphabet, max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    """Read instances with ``CoNLLXReader`` and pack them into padded tensors.

    Besides the word-level fields, each sentence contributes ``sent.bert_ids``
    and ``sent.sub_idx``, which are padded with 0.

    Args:
        source_path: File handed to ``CoNLLXReader``.
        word_alphabet, char_alphabet, pos_alphabet, type_alphabet: Alphabets
            handed to the reader; ``word_alphabet`` also supplies
            ``is_singleton``.
        max_size: If truthy, stop after this many instances have been read.
        normalize_digits, symbolic_root, symbolic_end: Forwarded to
            ``reader.getNext``.

    Returns:
        ``(data_tensor, data_size)`` where ``data_tensor`` is a dict with keys
        WORD, CHAR, POS, HEAD, TYPE, MASK, SINGLE, LENGTH, BERT_LENGTH,
        BERT_WORD and SUB_IDX, and ``data_size`` is the number of instances.
        BERT_WORD and SUB_IDX are as wide as the word-level tensors unless some
        sentence has more bert ids / sub indices than words, in which case
        they are widened to fit the longest such sequence.
    """
    data = []
    max_length = 0
    max_char_length = 0
    max_bert_length = 0
    max_sub_length = 0
    print('Reading data from %s' % source_path)
    counter = 0
    reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    try:
        inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root,
                              symbolic_end=symbolic_end)
        while inst is not None and (not max_size or counter < max_size):
            counter += 1
            if counter % 10000 == 0:
                print("reading data: %d" % counter)

            sent = inst.sentence
            data.append([sent.bert_ids, sent.sub_idx, sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads,
                         inst.type_ids])
            max_char_length = max(max_char_length, max(len(char_seq) for char_seq in sent.char_seqs))
            max_length = max(max_length, inst.length())
            max_bert_length = max(max_bert_length, len(sent.bert_ids))
            max_sub_length = max(max_sub_length, len(sent.sub_idx))
            inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root,
                                  symbolic_end=symbolic_end)
    finally:
        # Release the reader even if reading an instance fails.
        reader.close()
    print("Total number of data: %d" % counter)

    data_size = len(data)
    char_length = min(MAX_CHAR_LENGTH, max_char_length)
    # The bert-id and sub-index rows are filled with their own lengths, so the
    # matrices must be at least that wide; sizing them by max_length alone made
    # the slice assignment fail when a sentence had more bert ids than words.
    # Never going below max_length keeps the shapes unchanged otherwise.
    bert_width = max(max_length, max_bert_length)
    sub_width = max(max_length, max_sub_length)

    bert_wid_inputs = np.empty([data_size, bert_width], dtype=np.int64)
    subword_word_indicator = np.empty([data_size, sub_width], dtype=np.int64)
    wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64)
    pid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    hid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    tid_inputs = np.empty([data_size, max_length], dtype=np.int64)

    masks = np.zeros([data_size, max_length], dtype=np.float32)
    single = np.zeros([data_size, max_length], dtype=np.int64)
    lengths = np.empty(data_size, dtype=np.int64)
    bert_lengths = np.empty(data_size, dtype=np.int64)

    for i, inst in enumerate(data):
        bert_wids, subword_ids, wids, cid_seqs, pids, hids, tids = inst
        inst_size = len(wids)
        bpe_st_size = len(bert_wids)
        sub_size = len(subword_ids)
        lengths[i] = inst_size
        bert_lengths[i] = bpe_st_size
        # word ids
        wid_inputs[i, :inst_size] = wids
        wid_inputs[i, inst_size:] = PAD_ID_WORD
        # bert ids (padded with 0)
        bert_wid_inputs[i, :bpe_st_size] = bert_wids
        bert_wid_inputs[i, bpe_st_size:] = 0
        # sub indices (padded with 0)
        subword_word_indicator[i, :sub_size] = subword_ids
        subword_word_indicator[i, sub_size:] = 0
        # character ids: pad every word, then every position past the sentence
        for c, cids in enumerate(cid_seqs):
            cid_inputs[i, c, :len(cids)] = cids
            cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
        cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
        # pos ids
        pid_inputs[i, :inst_size] = pids
        pid_inputs[i, inst_size:] = PAD_ID_TAG
        # type ids
        tid_inputs[i, :inst_size] = tids
        tid_inputs[i, inst_size:] = PAD_ID_TAG
        # heads
        hid_inputs[i, :inst_size] = hids
        hid_inputs[i, inst_size:] = PAD_ID_TAG
        # masks
        masks[i, :inst_size] = 1.0
        # singleton flags
        for j, wid in enumerate(wids):
            if word_alphabet.is_singleton(wid):
                single[i, j] = 1

    data_tensor = {
        'WORD': torch.from_numpy(wid_inputs),
        'CHAR': torch.from_numpy(cid_inputs),
        'POS': torch.from_numpy(pid_inputs),
        'HEAD': torch.from_numpy(hid_inputs),
        'TYPE': torch.from_numpy(tid_inputs),
        'MASK': torch.from_numpy(masks),
        'SINGLE': torch.from_numpy(single),
        'LENGTH': torch.from_numpy(lengths),
        'BERT_LENGTH': torch.from_numpy(bert_lengths),
        'BERT_WORD': torch.from_numpy(bert_wid_inputs),
        'SUB_IDX': torch.from_numpy(subword_word_indicator),
    }
    return data_tensor, data_size