Example #1
def __init__(self,
             data_dir,
             entity_vocab,
             type_vocab,
             max_column=10,
             max_input_tok=500,
             src="train",
             max_length=[50, 10, 10],
             force_new=False,
             tokenizer=None):
    # Use the caller-supplied tokenizer, otherwise fall back to a locally
    # cached bert-base-uncased BertTokenizer.
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            'data/pre-trained_models/bert-base-uncased')
    self.src = src
    self.force_new = force_new
    self.max_input_tok = max_input_tok
    # max_length holds the per-field token budgets: [title, header, cell].
    self.max_title_length = max_length[0]
    self.max_header_length = max_length[1]
    self.max_cell_length = max_length[2]
    self.max_column = max_column
    self.entity_vocab = entity_vocab
    # Reverse mapping from an entity's Wikipedia id to its vocabulary index.
    self.entity_wikid2id = {
        self.entity_vocab[x]['wiki_id']: x
        for x in self.entity_vocab
    }
    self.type_vocab = type_vocab
    self.type_num = len(self.type_vocab)
    self.data = self._preprocess(data_dir)
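The reverse mapping built above assumes entity_vocab maps a vocabulary index to a record carrying at least a 'wiki_id' field (Example #2 relies on 'wiki_title' in the same way). A minimal sketch with hypothetical toy entries, showing how the inversion behaves:

# Hypothetical toy vocabulary; the real one comes from preprocessing.
entity_vocab = {
    0: {'wiki_id': 12345, 'wiki_title': 'Lisbon'},
    1: {'wiki_id': 67890, 'wiki_title': 'Porto'},
}

# Same inversion as in the constructor: Wikipedia id -> vocabulary index.
entity_wikid2id = {entity_vocab[x]['wiki_id']: x for x in entity_vocab}
assert entity_wikid2id[12345] == 0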
Example #2
def __init__(self,
             data_dir,
             entity_vocab,
             max_input_tok=500,
             src="train",
             max_cell_length=10,
             force_new=False,
             tokenizer=None):
    # Use the caller-supplied tokenizer, otherwise fall back to a locally
    # cached bert-base-uncased BertTokenizer.
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            'data/pre-trained_models/bert-base-uncased')
    self.src = src
    self.force_new = force_new
    self.max_input_tok = max_input_tok
    self.max_cell_length = max_cell_length
    self.entity_vocab = entity_vocab
    # Reverse mapping from an entity's Wikipedia title to its vocabulary index.
    self.entity_wiktitle2id = {
        self.entity_vocab[x]['wiki_title']: x
        for x in self.entity_vocab
    }
    # Preprocessing also yields the class vocabulary and its mask.
    self.class_vocab, self.class_mask, self.data = self._preprocess(
        data_dir)
    self.class_num = len(self.class_vocab)
Example #3
def __init__(self,
             data_dir,
             max_input_tok=500,
             src="train",
             max_length=[50, 50, 10, 10],
             force_new=False,
             tokenizer=None,
             for_bert=False):
    # Use the caller-supplied tokenizer, otherwise fall back to a locally
    # cached bert-base-uncased BertTokenizer.
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            'data/pre-trained_models/bert-base-uncased')
    self.src = src
    self.for_bert = for_bert
    # max_length holds the per-field token budgets: [query, title, header, cell].
    self.max_query_length = max_length[0]
    self.max_title_length = max_length[1]
    self.max_header_length = max_length[2]
    self.max_cell_length = max_length[3]
    self.force_new = force_new
    self.max_input_tok = max_input_tok
    self.data = self._preprocess(data_dir)
Example #4
def __init__(self,
             data_dir,
             ent_type_vocab,
             max_input_tok=500,
             src="train",
             max_length=[50, 10, 10, 100],
             force_new=False,
             tokenizer=None):
    # Use the caller-supplied tokenizer, otherwise fall back to a locally
    # cached bert-base-uncased BertTokenizer.
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            'data/pre-trained_models/bert-base-uncased')
    self.src = src
    self.force_new = force_new
    self.max_input_tok = max_input_tok
    # max_length holds the per-field token budgets: [title, header, cell, description].
    self.max_title_length = max_length[0]
    self.max_header_length = max_length[1]
    self.max_cell_length = max_length[2]
    self.max_description_length = max_length[3]
    self.ent_type_vocab = ent_type_vocab
    self.ent_type_num = len(self.ent_type_vocab)
    self.data = self._preprocess(data_dir)
Example #5
def __init__(self,
             data_dir,
             max_input_tok=500,
             src="train",
             max_length=[50, 10],
             force_new=False,
             tokenizer=None):
    # Use the caller-supplied tokenizer, otherwise fall back to a locally
    # cached bert-base-uncased BertTokenizer.
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            'data/pre-trained_models/bert-base-uncased')
    self.src = src
    self.force_new = force_new
    self.max_input_tok = max_input_tok
    # max_length holds the per-field token budgets: [title, header].
    self.max_title_length = max_length[0]
    self.max_header_length = max_length[1]
    # Load the header vocabulary and build the reverse (header -> id) mapping.
    self.header_vocab = self.load_header_vocab(data_dir)
    self.header2id = {self.header_vocab[x]: x for x in self.header_vocab}
    self.data = self._preprocess(data_dir)
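All five constructors fall back to the hard-coded local checkpoint path 'data/pre-trained_models/bert-base-uncased' for the tokenizer. Assuming BertTokenizer comes from the Hugging Face transformers package (the listing does not show the import), a caller can load a tokenizer once and pass it in through the tokenizer argument instead; a minimal sketch:

from transformers import BertTokenizer

# Load the standard uncased BERT tokenizer from the model hub (or local cache)
# and pass it via the `tokenizer` argument to bypass the hard-coded path.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.tokenize('list of tallest buildings'))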