def tokenize_sentence(X_text_list_train, X_text_list_test, MAX_SENTENCE_LEN):
    x_encoder = StaticTokenizerEncoder(
        sample=X_text_list_train,
        append_eos=False,
        tokenize=lambda x: x,
    )
    x_encoded_train = [x_encoder.encode(text) for text in X_text_list_train]
    x_padded_train = torch.LongTensor(
        pad_sequence(x_encoded_train, MAX_SENTENCE_LEN + 1))

    x_encoded_test = [x_encoder.encode(text) for text in X_text_list_test]
    x_padded_test = torch.LongTensor(
        pad_sequence(x_encoded_test, MAX_SENTENCE_LEN + 1))

    if x_padded_train.shape[1] > x_padded_test.shape[1]:
        x_padded_test = torch.cat(
            (
                x_padded_test,
                torch.zeros(
                    x_padded_test.shape[0],
                    x_padded_train.shape[1] - x_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return x_encoder, x_padded_train, x_padded_test
def __init__(self, json, text_encoder=None, label_encoder=None, vocab=None, mode='train'):
    '''
    Initialization

    Arguments:
        json: Json file containing the data.
              Structure of json file, e.g.:
              {'data': [{'id': filename,
                         'title': title of page,
                         'toc': [list of items in table of contents section of wiki page],
                         'intro': introduction of wiki page,
                         'label': 'positive'/'negative' flag}]}
              Labels are required only when mode = 'train'
        text_encoder: encoder object that encodes tokens to their unique integer ids
        label_encoder: encoder object that encodes labels to their unique integer ids
        vocab: external vocabulary used to initialize the text encoder.
               If vocab = None, it is generated from the tokens of the datasets provided
        mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
    '''
    self.data = json
    assert 'data' in self.data

    # Define the mode in which the dataset object is to be used
    self.mode = mode

    # Define text encoder and vocabulary
    if text_encoder:
        self._text_encoder = text_encoder
        self._vocab = self._text_encoder.vocab
    elif vocab:
        self._vocab = vocab
        self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                    append_eos=False,
                                                    tokenize=self.split)
    else:
        self._vocab = self.create_vocab()
        self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                    append_eos=False,
                                                    tokenize=self.split)
    self._vocab_size = self._text_encoder.vocab_size

    # Define label encoder
    if self.mode == 'train':
        if label_encoder:
            self._label_encoder = label_encoder
        else:
            self._label_encoder = LabelEncoder(
                [sample['label'] for sample in self.data['data']])
        self._label_size = self._label_encoder.vocab_size
    else:
        self._label_encoder = None
        self._label_size = None
def tokenize_pos_tags(X_tags_train, X_tags_test):
    # NOTE: MAX_SENTENCE_LEN is not a parameter here; this function relies on a
    # module-level MAX_SENTENCE_LEN being defined.
    x_postag_encoder = StaticTokenizerEncoder(
        sample=X_tags_train,
        append_eos=False,
        tokenize=lambda x: x,
    )
    x_postag_encoded_train = [
        x_postag_encoder.encode(text) for text in X_tags_train
    ]
    x_postag_padded_train = torch.LongTensor(
        pad_sequence(x_postag_encoded_train, MAX_SENTENCE_LEN + 1))
    # x_postag_ohe_train = torch.nn.functional.one_hot(x_postag_padded_train)

    x_postag_encoded_test = [
        x_postag_encoder.encode(text) for text in X_tags_test
    ]
    x_postag_padded_test = torch.LongTensor(
        pad_sequence(x_postag_encoded_test, MAX_SENTENCE_LEN + 1))

    if x_postag_padded_train.shape[1] > x_postag_padded_test.shape[1]:
        x_postag_padded_test = torch.cat(
            (
                x_postag_padded_test,
                torch.zeros(
                    x_postag_padded_test.shape[0],
                    x_postag_padded_train.shape[1] - x_postag_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)
    # x_postag_ohe_test = torch.nn.functional.one_hot(x_postag_padded_test)

    return x_postag_encoder, x_postag_padded_train, x_postag_padded_test
def train(
    self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=300
):
    """Train a tokenizer."""
    # Create a generator over the incorrect side of the pairs only
    dataset_example_gen = (
        ex["incorrect"] for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
    )

    self.tokenizer = StaticTokenizerEncoder(
        dataset_example_gen,
        min_occurrences=min_occurrences,
        append_eos=append_eos,
        append_sos=append_sos,
        tokenize=uni_bi_grams_vocab_gen,
        detokenize=self._detokenize,  # lambda x: "".join(x), i.e. concatenate all tokens
    )
    # Replace the vocabulary-building tokenizer with the bigram tokenizer
    self.tokenizer.tokenize = bigrams_tokenize  # ngram_tokenizer(self.ngrams)

    # After training, expose the tokenizer's attributes
    self.vocab_size = self.tokenizer.vocab_size
    self.padding_index = self.tokenizer.padding_index  # = 0
def tokenize_sentence(X_text_list_train, X_text_list_test, max_sent_len=800):
    """
    Fits a tokenizer on the train data list and encodes/pads both train + test

    :param X_text_list_train:
    :param X_text_list_test:
    :param max_sent_len: Max sentence length to pad to, defaults to 800
    :return: x_encoder, x_padded_train, x_padded_test
    """
    x_encoder = StaticTokenizerEncoder(
        sample=X_text_list_train,
        append_eos=False,
        tokenize=lambda x: x,
    )
    x_encoded_train = [x_encoder.encode(text) for text in X_text_list_train]
    x_padded_train = torch.LongTensor(
        pad_sequence(x_encoded_train, max_sent_len + 1)
    )

    x_encoded_test = [x_encoder.encode(text) for text in X_text_list_test]
    x_padded_test = torch.LongTensor(pad_sequence(x_encoded_test, max_sent_len + 1))

    # If the padded test tensor came out narrower than the train tensor, widen it to match
    if x_padded_train.shape[1] > x_padded_test.shape[1]:
        x_padded_test = torch.cat(
            (
                x_padded_test,
                torch.zeros(
                    x_padded_test.shape[0],
                    x_padded_train.shape[1] - x_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return x_encoder, x_padded_train, x_padded_test
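# Usage sketch (illustrative, not from the original project). Assumes the
# surrounding module also provides the pad_sequence(list_of_tensors, length)
# helper used above; inputs are lists of already-tokenized sentences because
# the encoder is built with tokenize=lambda x: x.
train_sentences = [['the', 'cat', 'sat'], ['a', 'dog', 'barked', 'loudly']]
test_sentences = [['the', 'dog', 'sat']]

x_encoder, x_padded_train, x_padded_test = tokenize_sentence(
    train_sentences, test_sentences, max_sent_len=10)

print(x_encoder.vocab_size)                       # vocabulary built from the train split only
print(x_padded_train.shape, x_padded_test.shape)  # test is widened to match the train width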
def generate_encodings(self, data, labels):
    encoder = StaticTokenizerEncoder(data,
                                     tokenize=lambda s: s.split(),
                                     min_occurrences=3)
    encoded_data = [encoder.encode(document) for document in data]
    encoded_data = [pad_tensor(x, length=10000) for x in encoded_data]
    data = {'labels': labels, 'inputs': encoded_data}
    return pd.DataFrame(data=data)
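# Usage sketch (illustrative; `prep` is a placeholder for whatever object
# exposes generate_encodings). Documents are plain strings; tokens seen fewer
# than three times map to the unknown token because of min_occurrences=3, and
# every encoded row is padded to length 10000.
docs = ['the quick brown fox', 'the lazy dog', 'the quick dog']
frame = prep.generate_encodings(docs, labels=[1, 0, 1])
print(frame.columns.tolist())    # ['labels', 'inputs']
print(frame['inputs'][0].shape)  # torch.Size([10000])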
def create_tokenizer():
    """
    Create and save a Pytorch-NLP tokenizer fitted on the TIMIT training transcripts.
    """
    transcripts = pd.read_csv('TRAIN.csv')['transcript']
    tokenizer = StaticTokenizerEncoder(transcripts,
                                       append_sos=True,
                                       append_eos=True,
                                       tokenize=data_utils.encode_fn,
                                       detokenize=data_utils.decode_fn)
    torch.save(tokenizer, 'tokenizer.pth')
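# Usage sketch (illustrative): reload the saved encoder and round-trip a
# transcript. The exact token ids depend on the project's own
# data_utils.encode_fn / decode_fn, so only the general shape is shown.
create_tokenizer()
tokenizer = torch.load('tokenizer.pth')
ids = tokenizer.encode('she had your dark suit in greasy wash water')
print(ids)                    # LongTensor, with <s>/</s> appended (append_sos/append_eos=True)
print(tokenizer.decode(ids))  # detokenized via data_utils.decode_fn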
def load(batch_size, augmentation, split, shuffle=True):
    """
    Args:
        split (string): Which subset of the data to take. One of 'train' or 'test'.
        batch_size (integer): Batch size.
        augmentation (bool): Whether to apply data augmentation. Only applied to the training set.

    Return:
        loader (DataLoader): A DataLoader that generates batches of (image, label sequence).
        tokenizer (Pytorch-NLP's StaticTokenizerEncoder): A tokenizer to encode/decode label sequences.
    """
    assert split in ['train', 'test']

    train_dataset = load_json('train') + load_json('extra')
    train_dataset = preprocess_label(train_dataset)
    tokenizer = StaticTokenizerEncoder(
        [x['anno']['label'] for x in train_dataset],
        tokenize=lambda s: s.split(),
        append_eos=True,
        reserved_tokens=['<pad>', '<unk>', '</s>'])
    print(tokenizer.vocab)

    if split == 'train':
        dataset = train_dataset
    else:
        dataset = load_json('test')
        dataset = preprocess_label(dataset)

    print("Compute bounding boxes ...")
    dataset = compute_bbox(dataset)
    dataset = SVHN(dataset, augmentation=(augmentation and split == 'train'))
    print("Dataset size:", len(dataset))

    loader = DataLoader(
        dataset,
        batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: dataset.generateBatch(batch, tokenizer),
        num_workers=4,
        pin_memory=True)
    return loader, tokenizer
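# Usage sketch (illustrative): build the train loader and check how a digit
# label sequence round-trips through the tokenizer. The batch structure itself
# comes from SVHN.generateBatch, so only the tokenizer side is shown concretely.
loader, tokenizer = load(batch_size=32, augmentation=True, split='train')
ids = tokenizer.encode('1 9 5')   # digit labels -> token ids, with '</s>' appended
print(tokenizer.decode(ids))
for batch in loader:              # one batch of (image, label sequence) data
    break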
def load_data(dict_fn, data_fn, batch_size, start_sign, end_sign, checkpoint_dir, max_length,
              max_train_data_size=0):
    """
    Data-loading method: organizes the pre-tokenized data and saves the vocabulary
    file along the way so other components can reuse it later. Returns the prepared
    loader and steps_per_epoch.

    Args:
        dict_fn: path where the training-data vocabulary is saved for later use
        data_fn: path of the pre-tokenized training data
        batch_size: batch size
        start_sign: start-of-sequence marker
        end_sign: end-of-sequence marker
        checkpoint_dir: directory where checkpoints are saved
        max_length: maximum sentence length
        max_train_data_size: maximum amount of training data to use

    Returns:
        loader: PyTorch DataLoader
        steps_per_epoch: number of steps per epoch
    """
    print("Reading training data...")
    (input_lang, target_lang), diag_weight = read_tokenized_data(data_fn, start_sign, end_sign,
                                                                 max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Merge input and target so a single shared vocabulary is built
    lang = np.hstack((input_lang, target_lang))
    print("Reading done, formatting training data...")
    tokenizer = StaticTokenizerEncoder(sample=lang, tokenize=lambda x: x.split())
    # Convert the text sequences to token ids, then pad them
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length], length=max_length, padding_index=0)
        for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length], length=max_length, padding_index=0)
        for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("Formatting done, organizing training data and saving the vocabulary")
    word_index = {}
    vocab_list = tokenizer.vocab
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("Vocabulary saved!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch
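# Usage sketch (paths and markers are placeholders, not from the original
# project): build the DataLoader and inspect one batch. PairDataset is assumed
# to yield (input, target, weight) triples, matching how it is constructed above.
loader, steps_per_epoch = load_data(
    dict_fn='data/vocab.json', data_fn='data/tokenized_pairs.txt',
    batch_size=64, start_sign='<start>', end_sign='<end>',
    checkpoint_dir='checkpoints/', max_length=40)

print(steps_per_epoch)
for inputs, targets, weights in loader:
    print(inputs.shape, targets.shape, weights.shape)
    break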
class BiGramTokenizerEncoder(TokenizerEncoder):

    def __init__(self):
        self.vocab_size = None
        self.padding_index = None
        self.ngrams = 2
        # Determine how many sequences we take to build the vocabulary
        self._tokenizer_max_seq = 3 * 10 ** 5
        self.tokenizer_name = "bigram_corrector_tokenizer"

    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=300
    ):
        """Train a tokenizer."""
        # Create a generator over the incorrect side of the pairs only
        dataset_example_gen = (
            ex["incorrect"] for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )

        self.tokenizer = StaticTokenizerEncoder(
            dataset_example_gen,
            min_occurrences=min_occurrences,
            append_eos=append_eos,
            append_sos=append_sos,
            tokenize=uni_bi_grams_vocab_gen,
            detokenize=self._detokenize,  # lambda x: "".join(x), i.e. concatenate all tokens
        )
        # Replace the vocabulary-building tokenizer with the bigram tokenizer
        self.tokenizer.tokenize = bigrams_tokenize  # ngram_tokenizer(self.ngrams)

        # After training, expose the tokenizer's attributes
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # = 0

    def _detokenize(self, tokens):
        return "".join(tokens)

    def encode(self, text):
        pass

    def encode_batch(self, samples):
        """
        Encodes a list of strings.

        Args:
            samples: list of strings
        """
        # Compatible with PyTorch-NLP: returns (padded token tensor, lengths)
        tokens, lengths = self.tokenizer.batch_encode(samples)
        return tokens, lengths

    def decode(self, text):
        pass
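# Usage sketch (illustrative): pair_dataset only needs to be an iterable of
# dicts with an "incorrect" key; uni_bi_grams_vocab_gen and bigrams_tokenize
# are assumed to be defined in the same module as the class above.
pair_dataset = [{'incorrect': 'helo wrld'}, {'incorrect': 'speling eror'}]

bigram_tok = BiGramTokenizerEncoder()
bigram_tok.train(pair_dataset, min_occurrences=1)

tokens, lengths = bigram_tok.encode_batch(['helo wrld', 'speling'])
print(bigram_tok.vocab_size, bigram_tok.padding_index)  # padding_index is 0
print(tokens.shape, lengths)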
class WikiDataset(Dataset):
    '''
    A custom dataset object that encodes a tokenized text and its labels
    according to the corresponding encoders
    '''

    def __init__(self, json, text_encoder=None, label_encoder=None, vocab=None, mode='train'):
        '''
        Initialization

        Arguments:
            json: Json file containing the data.
                  Structure of json file, e.g.:
                  {'data': [{'id': filename,
                             'title': title of page,
                             'toc': [list of items in table of contents section of wiki page],
                             'intro': introduction of wiki page,
                             'label': 'positive'/'negative' flag}]}
                  Labels are required only when mode = 'train'
            text_encoder: encoder object that encodes tokens to their unique integer ids
            label_encoder: encoder object that encodes labels to their unique integer ids
            vocab: external vocabulary used to initialize the text encoder.
                   If vocab = None, it is generated from the tokens of the datasets provided
            mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
        '''
        self.data = json
        assert 'data' in self.data

        # Define the mode in which the dataset object is to be used
        self.mode = mode

        # Define text encoder and vocabulary
        if text_encoder:
            self._text_encoder = text_encoder
            self._vocab = self._text_encoder.vocab
        elif vocab:
            self._vocab = vocab
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        else:
            self._vocab = self.create_vocab()
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        self._vocab_size = self._text_encoder.vocab_size

        # Define label encoder
        if self.mode == 'train':
            if label_encoder:
                self._label_encoder = label_encoder
            else:
                self._label_encoder = LabelEncoder(
                    [sample['label'] for sample in self.data['data']])
            self._label_size = self._label_encoder.vocab_size
        else:
            self._label_encoder = None
            self._label_size = None

    def __len__(self):
        '''
        Size of dataset
        '''
        return len(self.data['data'])

    def __getitem__(self, idx):
        '''
        Extract item corresponding to the idx'th index in data
        '''
        item = self.data['data'][idx]
        intro_enc = self._text_encoder.encode(item['intro'])
        toc = item['toc']
        if toc == []:
            toc_enc = self._text_encoder.encode('.')
        else:
            toc = ' '.join(toc)
            toc_enc = self._text_encoder.encode(toc)
        title_enc = self._text_encoder.encode(item['title'])
        if self.mode == 'train':
            return title_enc, toc_enc, intro_enc, self._label_encoder.encode(
                item['label']).view(-1)
        else:
            return title_enc, toc_enc, intro_enc

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def label_size(self):
        return self._label_size

    @property
    def text_encoder(self):
        return self._text_encoder

    @property
    def label_encoder(self):
        return self._label_encoder

    @property
    def vocab(self):
        return self._vocab

    def create_vocab(self, remove_less_freq_words=True, threshold=1):
        '''
        Creates vocabulary from the dataset tokens

        Returns:
            List of unique tokens in dataset
        '''
        temp_vocab = []
        for sample in self.data['data']:
            temp_vocab.extend(sample['title'].split())
            temp_vocab.extend(' '.join(sample['toc']).split())
            temp_vocab.extend(sample['intro'].split())

        vocab = []
        if remove_less_freq_words:
            count_dict = collections.Counter(temp_vocab)
            for word in count_dict.keys():
                if count_dict[word] > threshold:
                    vocab.append(word)
        else:
            vocab = sorted(list(set(temp_vocab)))
        return vocab

    def split(self, x):
        '''
        Splits the text into tokens
        '''
        return x.split()

    def collate_fn(self, batch, padding=True):
        """
        Collate function that needs to be passed to the PyTorch dataloader

        Returns:
            (title, title_lengths): tuple containing padded sequence tensor for title and sequence lengths
            (toc, toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths
            (intro, intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths
            labels: tensor containing labels for the batch
        """
        if self.mode == 'train':
            title, toc, intro, labels = zip(*batch)
            labels = torch.cat(labels)
        else:
            title, toc, intro = zip(*batch)

        if isinstance(intro, collections.Sequence):
            if padding:
                title, title_lengths = stack_and_pad_tensors(title)
                toc, toc_lengths = stack_and_pad_tensors(toc)
                intro, intro_lengths = stack_and_pad_tensors(intro)
                if self.mode == 'train':
                    return (title, title_lengths), (toc, toc_lengths), (intro, intro_lengths), labels
                else:
                    return (title, title_lengths), (toc, toc_lengths), (intro, intro_lengths)
        else:
            return batch

    @classmethod
    def fromJsonFile(cls, json_file, text_encoder=None, label_encoder=None, vocab=None, mode='train'):
        '''
        Read data from a json file

        Arguments:
            json_file: string specifying the location of the json file
        '''
        with open(json_file, 'r') as f:
            json_data = json.load(f)
        return cls(json_data, text_encoder, label_encoder, vocab, mode)
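# Usage sketch (the json structure follows the docstring above; the file path
# is a placeholder): build the dataset and a DataLoader that uses its collate_fn.
dataset = WikiDataset.fromJsonFile('wiki_pages.json', mode='train')
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True,
                                     collate_fn=dataset.collate_fn)

(title, title_len), (toc, toc_len), (intro, intro_len), labels = next(iter(loader))
print(dataset.vocab_size, dataset.label_size, labels.shape)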
def encoder(input_):
    return StaticTokenizerEncoder([input_])
def get_tokenizer(list_training_sentences):
    tokenizer = StaticTokenizerEncoder(sample=list_training_sentences,
                                       min_occurrences=2,
                                       append_sos=False,
                                       append_eos=True)
    return tokenizer
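# Usage sketch (sentences are illustrative): the default tokenizer splits on
# whitespace, tokens seen fewer than twice fall back to the unknown token
# because of min_occurrences=2, and '</s>' is appended to every encoded sentence.
tokenizer = get_tokenizer(['the cat sat', 'the cat ran', 'a dog ran'])
ids = tokenizer.encode('the cat jumped')   # 'jumped' maps to the unknown token
print(tokenizer.vocab_size, ids)
print(tokenizer.decode(ids))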