def get_vocab(paths):
    # Build a single frequency-filtered vocabulary over all poem files in `paths`.
    vocab = Vocabulary(min_freq=10)
    for path in paths:
        poems = get_poems(path)
        update_vocab(vocab, poems)
    vocab.build_vocab()
    return vocab
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    # Register '<cls>' several times so it survives any frequency cut-off
    # applied when the vocabulary is built.
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))

    return sets, vocab, lb_vocab
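# Usage sketch for the reader above. Hedged: 'train.tsv' and 'dev.tsv' are
# hypothetical paths; the function expects one "label<TAB>sentence" pair per line.
# sets, vocab, lb_vocab = read_instances_from_file(['train.tsv', 'dev.tsv'],
#                                                  max_len=400, keep_case=False)
# train_set, dev_set = sets
# first_sent_ids = [vocab.to_index(w) for w in train_set['sents'][0]]
# first_label_id = lb_vocab.to_index(train_set['labels'][0])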
def load_data_from_file(data_file, build_vocab=True, min_freq=1, max_vocab_size=5000):
    with open(data_file) as fp:
        data = [src_lang_model.tokenizer(text.strip()).text for text in fp]
    data = [remove_punc(tok.split()) for tok in data]
    if build_vocab:
        vocab = Vocabulary()
        vocab.build_vocab(data, lower=True, min_freq=min_freq,
                          max_vocab_size=max_vocab_size)
        return data, vocab
    else:
        return data
class Conversation:
    def __init__(self):
        self.dial_info = Dialogue_Info()
        self.vocab = Vocabulary()
        self.dial_pair = []
        self.train_dial_pair = []
        self.valid_dial_pair = []
        self.test_dial_pair = []
        self.encoded_train_dial_pair = []
        self.encoded_valid_dial_pair = []
        self.encoded_test_dial_pair = []

    def split_data_set(self, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1):
        train_size = int(len(self.dial_pair) * train_ratio)
        valid_size = int(len(self.dial_pair) * valid_ratio)
        test_size = int(len(self.dial_pair) * test_ratio)
        # Track sampled indices so validation/test pairs are drawn without
        # replacement and never overlap.
        valid_indices, test_indices = set(), set()
        while len(self.valid_dial_pair) < valid_size:
            index = random.randint(0, len(self.dial_pair) - 1)
            if index not in valid_indices:
                valid_indices.add(index)
                self.valid_dial_pair.append(self.dial_pair[index])
        while len(self.test_dial_pair) < test_size:
            index = random.randint(0, len(self.dial_pair) - 1)
            if index not in test_indices and index not in valid_indices:
                test_indices.add(index)
                self.test_dial_pair.append(self.dial_pair[index])
        # Everything that is neither validation nor test goes to training,
        # capped at train_size.
        for index in range(len(self.dial_pair)):
            if index not in valid_indices and index not in test_indices:
                self.train_dial_pair.append(self.dial_pair[index])
            if len(self.train_dial_pair) >= train_size:
                break

    def build_dialogue_pair(self):
        for item in self.dial_info.dial_info:
            item_len = len(item)
            # Drop a trailing utterance so the turns can be paired up.
            if item_len % 2 != 0:
                item = item[0:int(item_len / 2) * 2]
            for index in range(0, len(item), 2):
                self.dial_pair.append(list((item[index], item[index + 1])))

    def encode_dialogue_pair(self):
        # `max_len` is a module-level constant; keep only pairs whose encoded
        # source and target both have length strictly between 1 and max_len.
        def encode(pairs):
            encoded = []
            for src, tgt in pairs:
                encode_sent = [self.vocab.encode_sent(src),
                               self.vocab.encode_sent(tgt)]
                if 1 < len(encode_sent[0]) < max_len and \
                        1 < len(encode_sent[1]) < max_len:
                    encoded.append(encode_sent)
            return encoded

        self.encoded_train_dial_pair = encode(self.train_dial_pair)
        self.encoded_valid_dial_pair = encode(self.valid_dial_pair)
        self.encoded_test_dial_pair = encode(self.test_dial_pair)

    def save_dialogue_pair(self, exp_data_dir):
        # Persist every split (raw and encoded) as JSON under exp_data_dir.
        outputs = {
            'dialogue_pair.json': self.dial_pair,
            'train_dialogue_pair.json': self.train_dial_pair,
            'valid_dialogue_pair.json': self.valid_dial_pair,
            'test_dialogue_pair.json': self.test_dial_pair,
            'encoded_train_dialogue_pair.json': self.encoded_train_dial_pair,
            'encoded_valid_dialogue_pair.json': self.encoded_valid_dial_pair,
            'encoded_test_dialogue_pair.json': self.encoded_test_dial_pair,
        }
        for name, pairs in outputs.items():
            with open(os.path.join(exp_data_dir, name), 'w') as f:
                json.dump(pairs, f)

    def load_dialogue_pair(self, exp_data_dir):
        with open(os.path.join(exp_data_dir, 'train_dialogue_pair.json'), 'r') as f:
            self.train_dial_pair = json.load(f)
        with open(os.path.join(exp_data_dir, 'valid_dialogue_pair.json'), 'r') as f:
            self.valid_dial_pair = json.load(f)
        with open(os.path.join(exp_data_dir, 'test_dialogue_pair.json'), 'r') as f:
            self.test_dial_pair = json.load(f)

    def create_conversation(self, raw_data_dir, exp_data_dir):
        threshold = 5
        self.dial_info.build_dialogue_info(raw_data_dir)
        self.dial_info.save_dialogue_info(exp_data_dir)
        self.build_dialogue_pair()
        self.split_data_set()
        # Build the vocabulary from the training split before encoding.
        self.vocab.build_vocab(threshold, self.train_dial_pair)
        self.vocab.save_vocab(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)

    def re_encode_dialogue_pair(self, exp_data_dir):
        vocab_path = os.path.join(exp_data_dir, 'vocabulary.json')
        self.vocab.load_vocab(vocab_path)
        self.load_dialogue_pair(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)
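# Usage sketch for Conversation. Hedged: 'raw_corpus' and 'exp_data' are hypothetical
# directories, and Dialogue_Info / Vocabulary / max_len come from this project's modules.
# conv = Conversation()
# conv.create_conversation('raw_corpus', 'exp_data')   # build pairs, splits, vocab, encodings
# conv.re_encode_dialogue_pair('exp_data')             # later: re-encode with the saved vocab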
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self, data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            files = ["train.txt", "dev.txt", "test.txt"]
            for file in files:
                with open(str(self.data_dir / file), 'r') as fr:
                    for line in fr:
                        text = line.strip().split(" ")[0]
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.txt"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.txt"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.txt"), "test")

    def _create_examples1(self, input_path, mode):
        examples = []
        with open(input_path, 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label', None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key, value in label_entities.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                assert ''.join(words[start_index:end_index + 1]) == sub_name
                                # Single-character entities get 'S-', longer spans 'B-'/'I-'.
                                if start_index == end_index:
                                    labels[start_index] = 'S-' + key
                                else:
                                    labels[start_index] = 'B-' + key
                                    labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                json_d['id'] = f"{mode}_{idx}"
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                examples.append(json_d)
        return examples

    # Read the data set (one "char label" pair per line, sentences separated by blank lines).
    def _create_examples(self, input_path, mode):
        # Read the data set.
        with open(input_path, "r", encoding="utf-8") as f:
            content = [_.strip() for _ in f.readlines()]
        # Collect the raw sentences together with their tag sequences.
        # Record the indices of the blank lines (lines that contain no space).
        index = [-1]
        index.extend([i for i, _ in enumerate(content) if ' ' not in _])
        index.append(len(content))
        # Split on blank lines and read each sentence and its tag sequence.
        sentences, tags = [], []
        examples = []
        idx = 0
        for j in range(len(index) - 1):
            json_d = {}
            sent, tag = [], []
            segment = content[index[j] + 1: index[j + 1]]
            for line in segment:
                sent.append(line.strip().split(" ")[0])
                tag.append(line.strip().split(" ")[-1])
            sentences.append(' '.join(sent))
            tags.append(tag)
            json_d['id'] = f"{mode}_{idx}"
            json_d['context'] = " ".join(sent)
            json_d['tag'] = " ".join(tag)
            json_d['raw_context'] = "".join(sent)
            idx += 1
            examples.append(json_d)
        return examples
def read_instances_from_file(files, max_len, keep_case):
    ''' Collect instances and construct vocab '''
    vocab = Vocabulary()
    pos_vocab = Vocabulary(need_default=False)
    ner_vocab = Vocabulary(need_default=False)
    srl_vocab = Vocabulary(need_default=False)
    chunk_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents = []
        pos_labels, ner_labels, srl_labels, chunk_labels = [], [], [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            sent = []
            pos_label, ner_label, srl_label, chunk_label = [], [], [], []
            for l in lines:
                l = l.strip()
                if l == '':
                    if len(sent) > 0:
                        if len(sent) > max_len:
                            trimmed_sent += 1
                            pos_labels.append(pos_label[:max_len])
                            ner_labels.append(ner_label[:max_len])
                            srl_labels.append(srl_label[:max_len])
                            chunk_labels.append(chunk_label[:max_len])
                            sents.append(sent[:max_len])
                        else:
                            pos_labels.append(pos_label)
                            ner_labels.append(ner_label)
                            srl_labels.append(srl_label)
                            chunk_labels.append(chunk_label)
                            sents.append(sent)
                    sent = []
                    pos_label, ner_label, srl_label, chunk_label = [], [], [], []
                else:
                    l = l.split()
                    word = l[0]
                    if not keep_case:
                        word = word.lower()
                    sent.append(word)
                    pos_label.append(l[2])
                    ner_label.append(l[3])
                    srl_label.append(l[4])
                    chunk_label.append(l[5])
                    vocab.add_word(word)
                    pos_vocab.add_word(l[2])
                    ner_vocab.add_word(l[3])
                    srl_vocab.add_word(l[4])
                    chunk_vocab.add_word(l[5])

        sets.append({
            'sents': sents,
            'pos_labels': pos_labels,
            'ner_labels': ner_labels,
            'srl_labels': srl_labels,
            'chunk_labels': chunk_labels
        })
        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.warning(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}'.format(len(vocab)))
    pos_vocab.build_vocab()
    ner_vocab.build_vocab()
    srl_vocab.build_vocab()
    chunk_vocab.build_vocab()
    logger.info('# class in POS Tagging: {}'.format(len(pos_vocab)))
    logger.info('# class in NER Tagging: {}'.format(len(ner_vocab)))
    logger.info('# class in SRL Tagging: {}'.format(len(srl_vocab)))
    logger.info('# class in Chunking: {}'.format(len(chunk_vocab)))
    return sets, vocab, [pos_vocab, ner_vocab, srl_vocab, chunk_vocab]
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self, data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            files = ["train.json", "dev.json", "test.json"]
            for file in files:
                with open(str(self.data_dir / file), 'r') as fr:
                    for line in fr:
                        line = json.loads(line.strip())
                        text = line['text']
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.json"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.json"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.json"), "test")

    def _create_examples(self, input_path, mode):
        examples = []
        with open(input_path, 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label', None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key, value in label_entities.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                assert ''.join(words[start_index:end_index + 1]) == sub_name
                                if start_index == end_index:
                                    labels[start_index] = 'S-' + key
                                else:
                                    labels[start_index] = 'B-' + key
                                    labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                json_d['id'] = f"{mode}_{idx}"
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                examples.append(json_d)
        return examples
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self, data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        """If the vocab file already exists, load it; otherwise build it and save it."""
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            # Building the vocab from train alone would be enough: without a
            # pretrained model, characters outside train are never trained anyway.
            files = ["train.txt", "dev.txt", "test.txt"]
            for file in files:
                with open(str(self.data_dir / file), 'r', encoding='utf-8') as fr:
                    for line in fr:
                        line = line.strip().split(' ')
                        text = line[0]
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.txt"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.txt"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.txt"), "test")

    def _create_examples(self, input_path, mode):
        """
        Returns: List[Dict], e.g.
        [{'context': '中 国 人', 'tag': 'B-name I-name I-name'}, {...}, ...]
        """
        examples = []
        with open(input_path, 'r', encoding='utf-8') as f:
            words, labels = [], []
            flag = False
            for line in f:
                json_d = {}
                content = line.strip()
                tokens = content.split(' ')  # [word, label]
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[-1])
                    if tokens[-1] != 'O':
                        flag = True
                else:
                    # A blank line ends the current sentence; keep it only if it
                    # contains at least one entity (a non-'O' tag).
                    if len(content) == 0 and len(words) > 0:
                        if flag:
                            json_d['context'] = " ".join(words)
                            json_d['tag'] = " ".join(labels)
                            words, labels = [], []
                            examples.append(json_d)
                            flag = False
                        else:
                            words = []
                            labels = []
        return examples
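# Usage sketch shared by the CluenerProcessor variants above. Hedged: 'dataset/cluener'
# is a hypothetical directory; data_dir must be a pathlib.Path because it is combined
# with file names via '/' and queried with .exists().
# from pathlib import Path
# processor = CluenerProcessor(Path('dataset/cluener'))
# processor.get_vocab()
# train_examples = processor.get_train_examples()
# print(train_examples[0]['context'], train_examples[0]['tag'])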