def __init__(self,
             vocab_path,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             in_tokens=False,
             is_inference=False,
             learning_strategy='pointwise',
             random_seed=None,
             tokenizer="FullTokenizer",
             phase='train',
             is_classify=True,
             is_regression=False,
             for_cn=True,
             task_id=0,
             is_tsv=True):
    """Build the tokenizer, cache special-token ids and reset reader state.

    Args:
        vocab_path: Vocabulary file passed to ``tokenization.FullTokenizer``.
        label_map_config: Optional path to a UTF-8 JSON file. When truthy,
            its parsed content becomes ``self.label_map``; otherwise
            ``self.label_map`` is ``None``.
        max_seq_len: Stored as ``self.max_seq_len``.
        do_lower_case: Forwarded to ``tokenization.FullTokenizer``.
        in_tokens: Stored as ``self.in_tokens``.
        is_inference: Stored as ``self.is_inference``.
        learning_strategy: Stored as ``self.learning_strategy``.
        random_seed: When not ``None``, seeds NumPy's global RNG. ``None``
            leaves the global RNG state untouched.
        tokenizer: Accepted for interface compatibility; this initializer
            does not read it and always builds a ``FullTokenizer``.
        phase: Either ``'train'`` or ``'predict'``.
        is_classify: Stored as ``self.is_classify``.
        is_regression: Stored as ``self.is_regression``.
        for_cn: Stored as ``self.for_cn``.
        task_id: Stored as ``self.task_id``.
        is_tsv: Stored as ``self.is_tsv``.

    Raises:
        AssertionError: If ``phase`` is not ``'train'`` or ``'predict'``.

    A vocabulary lacking ``[PAD]``, ``[CLS]``, ``[SEP]`` or ``[MASK]`` fails
    at the corresponding lookup (a ``KeyError`` if the vocabulary is a plain
    dict -- TODO confirm the vocab type).
    """
    if phase not in ('train', 'predict'):
        # Raised explicitly instead of via `assert` so the check survives
        # `python -O`; exception type and message match the old assert.
        raise AssertionError("supported phase: train, predict.")

    self.max_seq_len = max_seq_len
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=do_lower_case)
    self.vocab = self.tokenizer.vocab
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.mask_id = self.vocab["[MASK]"]

    self.in_tokens = in_tokens
    self.phase = phase
    self.is_inference = is_inference
    self.learning_strategy = learning_strategy
    self.for_cn = for_cn
    self.task_id = task_id
    self.is_tsv = is_tsv

    # np.random.seed(None) would re-seed the global RNG from OS entropy and
    # silently discard any seed the caller set earlier, so only seed when a
    # value was actually supplied.
    if random_seed is not None:
        np.random.seed(random_seed)

    self.is_classify = is_classify
    self.is_regression = is_regression

    # Iteration bookkeeping; all start empty/zero.
    self.current_example = 0
    self.current_epoch = 0
    self.num_examples = 0
    self.examples = {}

    if label_map_config:
        with open(label_map_config, encoding='utf8') as f:
            self.label_map = json.load(f)
        if six.PY2:
            # Python 2 only: post-process the loaded mapping with
            # unicode_convert (helper defined elsewhere in the project).
            self.label_map = unicode_convert(self.label_map)
    else:
        self.label_map = None
def __init__(self,
             vocab_path,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             in_tokens=False,
             random_seed=None,
             tokenizer="FullTokenizer",
             is_classify=True,
             is_regression=False,
             for_cn=True,
             task_id=0,
             doc_stride=128,
             max_query_length=64,
             remove_noanswer=True):
    """Create the tokenizer, record settings and declare the record types.

    Args:
        vocab_path: Vocabulary file passed to ``tokenization.FullTokenizer``.
        max_seq_len: Stored as ``self.max_seq_len``.
        do_lower_case: Forwarded to ``tokenization.FullTokenizer``.
        in_tokens: Stored as ``self.in_tokens``.
        random_seed: When not ``None``, seeds NumPy's global RNG.
        for_cn: Stored as ``self.for_cn``.
        task_id: Stored as ``self.task_id``.
        doc_stride: Stored as ``self.doc_stride``.
        max_query_length: Stored as ``self.max_query_length``.
        remove_noanswer: Stored as ``self.remove_noanswer``.
        label_map_config, tokenizer, is_classify, is_regression: Accepted
            but not read by this initializer.

    Three namedtuple classes are attached to the instance as
    ``self.Example``, ``self.Feature`` and ``self.DocSpan``.
    """
    self.max_seq_len = max_seq_len

    # Tokenizer first, then the special-token ids looked up from its vocab.
    full_tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path,
        do_lower_case=do_lower_case,
    )
    self.tokenizer = full_tokenizer
    self.vocab = full_tokenizer.vocab
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]

    # Plain configuration values.
    self.in_tokens = in_tokens
    self.for_cn = for_cn
    self.task_id = task_id
    self.doc_stride = doc_stride
    self.max_query_length = max_query_length
    self.remove_noanswer = remove_noanswer

    # Empty containers, assigned as separate dict objects.
    self.examples = {}
    self.features = {}

    # Leave the global RNG alone unless a seed was explicitly requested.
    if random_seed is not None:
        np.random.seed(random_seed)

    # Iteration bookkeeping.
    self.current_example = 0
    self.current_epoch = 0
    self.num_examples = 0

    # Record types, created per instance.
    example_fields = [
        'qas_id',
        'question_text',
        'doc_tokens',
        'orig_answer_text',
        'start_position',
        'end_position',
    ]
    feature_fields = [
        "unique_id",
        "example_index",
        "doc_span_index",
        "tokens",
        "token_to_orig_map",
        "token_is_max_context",
        "token_ids",
        "position_ids",
        "text_type_ids",
        "start_position",
        "end_position",
    ]
    doc_span_fields = ["start", "length"]

    self.Example = namedtuple('Example', example_fields)
    self.Feature = namedtuple("Feature", feature_fields)
    self.DocSpan = namedtuple("DocSpan", doc_span_fields)