def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and the prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        row_dict['target'] = [-1] * maxlen
        row_dict['target_once'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        target_end = maxlen
        target_length = target_end - target_start
        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = []
            tokenized_target_once_id = [-1] * len(tokenized_input)
            target_ids = tokenizer.convert_tokens_to_ids(tokenized_target)
            target_length = len(target_ids)
            tokenized_target_id.extend(target_ids)
            tokenized_target_once_id.extend(target_ids)
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            tokenized_target_once_id.extend([-1] * (maxlen - len(tokenized_target_once_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['target_once'] = tokenized_target_once_id
        input_length = min(maxlen, target_start * 3)
        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id = [1] * input_length
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        row_dict['input_length'] = input_length
        row_dict['target_length'] = target_length
        feature_dict_list.append(row_dict)
    return feature_dict_list
def get_feature_from_data(tokenizer, maxlen, tasks, task, input, target=None, handle_exceed='slide', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        row_dict['task'] = task
        input_token = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(input_token)
        mask_id = [1] * len(tokenized_input_id)
        tokenized_input_id.extend([tokenizer.pad_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([-1] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['target'] = [-1]
        if target is not None:
            if 'multi_label' in task:
                mlb = MultiLabelBinarizer(classes=tasks[task])
                tar = mlb.fit_transform([target])
                tokenize_label = tar
            else:
                tokenize_label = [tasks[task].index(target[0])]
            row_dict['target'] = tokenize_label
        feature_dict_list.append(row_dict)
    return feature_dict_list
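# Usage sketch (illustrative only): how the classification feature builder above might be called.
# It assumes a HuggingFace tokenizer and that this builder, together with its `tok` helper module,
# is importable from its own task module; the task name and label set below are hypothetical.
def _demo_classification_features():
    from transformers import AutoTokenizer
    demo_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    demo_tasks = {'sentiment': ['negative', 'neutral', 'positive']}
    rows = get_feature_from_data(demo_tokenizer, maxlen=128, tasks=demo_tasks, task='sentiment',
                                 input='the movie was great', target=['positive'])
    # each row holds padded 'input'/'mask' id lists plus the label index in 'target'
    return rows[0]['target']  # -> [2] for this hypothetical label set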
def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(target)) if target is not None else len(previous)
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - pred_len - reserved_len, handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and the target's end sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        target_start = len(t_input_id) - 1
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        if target is not None:
            t_input_id.extend(tokenizer.convert_tokens_to_ids(target))
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(target + [tok.tok_sep(tokenizer)]))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id
        mask_id = [1] * len(t_input_id)
        t_input_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(t_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = t_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2 - len(previous) - 1, handle_exceed)
    for t_input in t_input_list:  # -2 for cls/sep, -1 for the trailing mask token
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 2] + [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input.append(tok.tok_mask(tokenizer))
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        mask_id = [1] * len(t_input)
        target_start = len(t_input_id) - 1
        target_end = maxlen
        t_input_id.extend([0] * (maxlen - len(t_input_id)))
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        if target is not None:
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.append(tokenizer.convert_tokens_to_ids(target)[-1])
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id = [0] * len(t_input)
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = t_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        feature_dict_list.append(row_dict)
    return feature_dict_list
def get_feature_from_data(tokenizer, maxlen, input, target=None, handle_exceed='start_slice', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        row_dict['target'] = [-1] * maxlen
        if target is not None:
            tokenized_target = []
            targets_pointer = 0
            for tok_pos, text in enumerate(tokenized_input):
                if text == tok.tok_mask(tokenizer):
                    tok_target = 1 if targets_pointer == int(target) else 0
                    tokenized_target.append(tok_target)
                    targets_pointer += 1
                else:
                    tokenized_target.append(-1)
            tokenized_target.extend([-1] * (maxlen - len(tokenized_target)))
            row_dict['target'] = tokenized_target
        target_pos_list = []
        for tok_pos, text in enumerate(tokenized_input):
            if text == tok.tok_mask(tokenizer):
                target_pos_list.append(tok_pos)
        target_pos_list.extend([0] * (4 - len(target_pos_list)))
        if len(target_pos_list) != 4:
            continue
        row_dict['target_pos'] = target_pos_list
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        tokenized_input_id.extend(
            [tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)])[0]] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and the prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = [-1] * len(tokenized_input)
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(tokenized_target))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None:
            tokenized_ntarget = tokenizer.tokenize(ntarget)
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenizer.convert_tokens_to_ids(tokenized_ntarget))
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            row_dict['ntarget'] = tokenized_ntarget_id
        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
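# Usage sketch (illustrative only): building a single "generate the whole target at once" row with the
# builder above. Assumes a HuggingFace tokenizer and that the builder plus its `tok` helpers are
# importable; the input/target strings are hypothetical.
def _demo_once_features():
    from transformers import AutoTokenizer
    demo_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    rows = get_feature_from_data(demo_tokenizer, maxlen=64,
                                 input='summarize this rather long sentence',
                                 target='a short summary')
    row = rows[0]
    # 'input' is padded with mask ids past the source; 'target' carries -1 up to 'start'
    return row['start'], row['target']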
def predict(self, input='', topK=1, topP=0.85, mode=['greedy', 'topK', 'topP'], decodenum=1, filtersim=True,
            reserved_len=0, task=None, handle_exceed='start_slice'):
    filtersim = json.loads(str(filtersim).lower())
    topK = int(topK)
    topP = float(topP)
    decodenum = int(decodenum)
    mode = (mode[0] if isinstance(mode, list) else mode).lower()
    self.eval()
    sequences = [[[], 1.0]]
    with torch.no_grad():
        while True:
            all_candidates = list()
            exceed = False
            for seq in sequences:
                if tok.tok_sep(self.tokenizer) not in seq[0]:
                    tokens, score = seq
                    feature_dict = get_feature_from_data(self.tokenizer, self.maxlen, input, tokens,
                                                         reserved_len=reserved_len,
                                                         handle_exceed=handle_exceed)[-1]
                    # check input exceed
                    if len(tokens) >= self.maxlen or feature_dict['start'] >= self.maxlen:
                        exceed = True
                        all_candidates.append(seq)
                        continue
                    for k, v in feature_dict.items():
                        feature_dict[k] = [v]
                    predictions = self.forward(feature_dict, eval=True, use_prev=True)
                    token_prob_list = predictions['label_prob_all'][0]
                    # topK / topP sampling
                    if 'top' in mode:
                        prob_list = [prob for _, prob in token_prob_list]
                        if 'topk' in mode:
                            sample_list = prob_list[:topK]
                            decode_range = max(decodenum, topK)
                            prob_norm = [float(i) / sum(sample_list) for i in sample_list]
                            choice_list = np.random.choice(sample_list, p=prob_norm, size=decode_range,
                                                           replace=False)
                        else:
                            topP_list = np.cumsum(prob_list)
                            index_overP = [i for i, x in enumerate(topP_list) if x > topP]
                            index_overP = 0 if len(index_overP) < 1 else index_overP[0]
                            sample_list = prob_list[:index_overP + 1]
                            prob_norm = [float(i) / sum(sample_list) for i in sample_list]
                            choice_list = np.random.choice(sample_list, p=prob_norm, size=decodenum)
                        for idx in range(decodenum):
                            sampling_index = prob_list.index(choice_list[idx])
                            k, v = token_prob_list[sampling_index]
                            candidate = [tokens + [k], score + -log(v)]
                            all_candidates.append(candidate)
                    # greedy / beam search
                    else:
                        for k, v in token_prob_list[:50]:
                            if (len(tokens) > 0 and tokens[-1] == k) or len(k) < 1:
                                continue
                            candidate = [tokens + [k], score + -log(v)]
                            all_candidates.append(candidate)
                else:
                    all_candidates.append(seq)
            ordered = sorted(all_candidates, key=lambda tup: tup[1])
            if filtersim:
                self._filterSimilar(ordered, decodenum)
            sequences = ordered[:decodenum]
            stop = 0
            for i in sequences:  # i[0]: sequence tokens, i[1]: sequence score
                if tok.tok_sep(self.tokenizer) in i[0] or i[1] > self.maxlen:
                    stop += 1
            if stop == len(sequences) or exceed:
                break
        for i in range(len(sequences)):
            if tok.tok_sep(self.tokenizer) in sequences[i][0]:  # remove sep token and everything after it
                sequences[i][0] = sequences[i][0][:sequences[i][0].index(tok.tok_sep(self.tokenizer))]
            sequences[i][0] = "".join(self.tokenizer.convert_tokens_to_string(sequences[i][0]))
        result_dict = {
            'label_map': sequences
        }
        self.encoder_hidden = None
        return [i[0] for i in sequences], [result_dict]
def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(target)) + 1 if target is not None else len(previous) - 1
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2 - pred_len, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 2] + [tok.tok_sep(tokenizer)]
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        encoder_mask_id = [1] * len(t_input)
        encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
        t_input_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(t_input_id)))
        if target is not None:
            tokenized_target_id = []
            tokenized_prev_id = []
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_begin(tokenizer)] + target))
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(target + [tok.tok_sep(tokenizer)]))
            decoder_mask_id = [1] * len(tokenized_prev_id)
            decoder_mask_id.extend([0] * (maxlen - len(decoder_mask_id)))
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(tokenized_prev_id)))
            tokenized_target_id.extend([-100] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['prev'] = tokenized_prev_id
            if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
                tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
                tokenized_ntarget_id = tokenized_ntarget
                tokenized_ntarget_id.extend([-100] * (maxlen - len(tokenized_ntarget_id)))
                if len(tokenized_ntarget_id) <= maxlen:
                    row_dict['ntarget'] = tokenized_ntarget_id
        else:
            tokenized_prev_id = [tokenizer.convert_tokens_to_ids(tok.tok_begin(tokenizer))]
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids(previous))
            target_start = len(tokenized_prev_id) - 1
            row_dict['start'] = target_start
            decoder_mask_id = [1] * len(tokenized_prev_id)
            row_dict['prev'] = tokenized_prev_id
        row_dict['input'] = t_input_id
        row_dict['encoder_mask'] = encoder_mask_id
        row_dict['decoder_mask'] = decoder_mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
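# Usage sketch (illustrative only): one training row and one decoding row from the encoder-decoder
# builder above. Assumes a HuggingFace tokenizer and importable `tok` helpers; text values are
# hypothetical, and targets are passed as token lists, as the builder expects.
def _demo_seq2seq_features():
    from transformers import AutoTokenizer
    demo_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    # training: teacher-forced 'prev' plus shifted 'target' labels padded with -100
    train_row = get_feature_from_data(demo_tokenizer, maxlen=64, input='a long source sentence',
                                      previous=[], target=demo_tokenizer.tokenize('a short target'))[0]
    # decoding: no target, 'previous' holds tokens generated so far and 'start' points at the next slot
    decode_row = get_feature_from_data(demo_tokenizer, maxlen=64, input='a long source sentence',
                                       previous=[])[0]
    return train_row['prev'], decode_row['start']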
def preprocessing_data(item, tokenizer, maxlen=512, handle_exceed='start_slice',
                       likelihood=['none', 'pos', 'neg', 'both'], reserved_len=0, **kwargs):
    likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
    tasks, task, input, targets = item
    p_target, n_target = targets
    input = input.strip()
    tokenized_target = tokenizer.tokenize(" ".join(p_target))
    param_dict = {
        'tokenizer': tokenizer,
        'maxlen': maxlen,
        'handle_exceed': handle_exceed,
        'reserved_len': reserved_len
    }

    # each word in sentence
    for j in range(1, len(tokenized_target) + 1):
        if "neg" in likelihood or 'both' in likelihood:
            # formatting neg data in csv
            if n_target is None:
                ntext_arr = [tokenizer.convert_tokens_to_string(tokenized_target[:j - 1])]
            elif "[SEP]" in n_target:
                ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
            else:
                ntext_arr = [n_target.strip()]
            # adding neg data
            for neg_text in ntext_arr:
                yield once.get_feature_from_data, {
                    **{
                        'input': input + " " + " ".join(tokenized_target[:j - 1]),
                        'target': tokenized_target[:j][-1],
                        'ntarget': neg_text,
                        'add_end_tok': False
                    },
                    **param_dict
                }
        else:
            yield get_feature_from_data, {
                **{
                    'input': input,
                    'previous': tokenized_target[:j - 1],
                    'target': tokenized_target[:j],
                    'ntarget': None
                },
                **param_dict
            }

    # end of the last word
    if "neg" in likelihood or 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [tokenizer.convert_tokens_to_string(tokenized_target[:j - 1])]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        # adding neg data
        for neg_text in ntext_arr:
            yield get_feature_from_data, {
                **{
                    'input': input,
                    'previous': tokenized_target,
                    'target': [tok.tok_sep(tokenizer)],
                    'ntarget': neg_text
                },
                **param_dict
            }
    else:
        yield get_feature_from_data, {
            **{
                'input': input,
                'previous': tokenized_target,
                'target': [tok.tok_sep(tokenizer)],
                'ntarget': None
            },
            **param_dict
        }

    # whole sentence masking
    if 'pos' in likelihood:
        yield once.get_feature_from_data, {
            **{
                'input': input,
                'target': " ".join(p_target)
            },
            **param_dict
        }
    elif 'both' in likelihood or "neg" in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [tokenizer.convert_tokens_to_string(tokenized_target[:j - 1])]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield once.get_feature_from_data, {
                **{
                    'input': input,
                    'target': " ".join(p_target),
                    'ntarget': neg_text
                },
                **param_dict
            }
    return get_feature_from_data, param_dict
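# Usage sketch (illustrative only): iterating the likelihood-aware preprocessing generator above.
# The item layout (tasks, task, input, (positive_target, negative_target)) mirrors the unpacking at
# the top of the function; the tokenizer and values below are hypothetical, and each yielded pair is
# a feature builder plus the keyword arguments to call it with.
def _demo_preprocessing_rows():
    from transformers import AutoTokenizer
    demo_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    demo_item = ({}, 'gen', 'some source sentence', (['a', 'short', 'target'], None))
    all_rows = []
    for builder_fn, builder_kwargs in preprocessing_data(demo_item, demo_tokenizer,
                                                         maxlen=64, likelihood='none'):
        all_rows.extend(builder_fn(**builder_kwargs))
    return all_rows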
def get_feature_from_data(tokenizer, labels, input, target=None, maxlen=512, separator=" ", handle_exceed='slide'):
    feature_dict_list = []
    mapping_index = []
    pos = 1  # cls as start 0
    for i in input.split(" "):
        for _ in range(len(tokenizer.tokenize(i))):
            if _ < 1:
                mapping_index.append({'char': i, 'pos': pos})
            pos += 1
    if target is not None:
        target = target.split(separator)
    t_input_list, t_pos_list = tok.handle_exceed(tokenizer, input, maxlen - 2, mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -2 for cls and sep
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        if target is not None:
            target_token = []
            pev = 0
            for tok_map, target_label in zip(mapping_index, target):
                if t_pos[0] < tok_map['pos'] <= t_pos[1]:
                    for _ in range(tok_map['pos'] - pev):
                        target_token += [labels.index(target_label)]
                pev = tok_map['pos']
            if "O" in labels:
                target_id = [labels.index("O")] + target_token + [labels.index("O")]
            else:
                target_id = [target_token[0]] + target_token + [target_token[-1]]
            if len(input_id) != len(target_id):
                print("input target len not equal ", len(input_id), len(target_id))
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id
        map_start = 0
        map_end = len(mapping_index)
        for pos, tok_map in enumerate(mapping_index):
            if t_pos[0] == tok_map['pos']:
                map_start = pos
            elif t_pos[1] == tok_map['pos']:
                map_end = pos
        row_dict['mapping'] = mapping_index[map_start:map_end + 1]
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        row_dict['pos'] = [map_start, map_end]
        feature_dict_list.append(row_dict)
    return feature_dict_list
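# Usage sketch (illustrative only): building token-tagging features with the builder above. Assumes a
# HuggingFace tokenizer and importable `tok` helpers; the label scheme and sentence are hypothetical,
# with one whitespace-separated label per input word.
def _demo_tagging_features():
    from transformers import AutoTokenizer
    demo_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    demo_labels = ['O', 'B-PER', 'I-PER']
    rows = get_feature_from_data(demo_tokenizer, demo_labels,
                                 input='john smith lives here',
                                 target='B-PER I-PER O O', maxlen=32)
    row = rows[0]
    # 'mapping' links each word back to its first sub-token position; 'target' holds label indices
    return row['mapping'], row['target'][:row['end']]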