Example #1
    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [48, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [
            "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this",
            "▁is", "▁fal", "s", "é", "."
        ])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        # "é" maps to id 1, the sample vocab's <unk>, so the round trip below recovers "<unk>" in its place.
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and",
                "▁this", "▁is", "▁fal", "s", "<unk>", "."
            ],
        )
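
For reference, the same round trip can be run outside the test harness. The sketch below is a minimal, hypothetical standalone version: it assumes the public "albert-base-v2" checkpoint instead of the SAMPLE_VOCAB fixture, so the exact tokens and ids will differ (the pretrained tokenizer lower-cases and strips accents by default, unlike the keep_accents=True fixture above).

from transformers import AlbertTokenizer

# Hypothetical standalone round trip; "albert-base-v2" is assumed to be available
# from the Hugging Face hub, and its default normalisation differs from the
# keep_accents=True fixture used in the test above.
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
ids = tokenizer.convert_tokens_to_ids(tokens)
back_tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens)
print(ids)
print(back_tokens)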

Example #2
    def getRelations(self, tokenizer: AlbertTokenizer, tokenBatch: List[List[int]],
                     spanBatch: torch.Tensor):
        # spanBatch has shape [batch_size, 2, 3]: the two most probable entity spans per
        # sentence; only the (start, end) columns are read below.
        sentences = [None] * spanBatch.shape[0]
        sub_idx = [0] * spanBatch.shape[0]
        obj_idx = [0] * spanBatch.shape[0]

        # Ids of the entity boundary markers; these tokens are expected to be in the
        # tokenizer's vocabulary, otherwise convert_tokens_to_ids maps them to <unk>.
        markers = tokenizer.convert_tokens_to_ids(
            ["<e1>", "</e1>", "<e2>", "</e2>"])
        for i, (tokens, spans) in enumerate(zip(tokenBatch, spanBatch)):
            i1S, i1E = int(spans[0, 0]), int(spans[0, 1])
            i2S, i2E = int(spans[1, 0]), int(spans[1, 1])
            # Make sure entity 1 starts before entity 2 so the insertions below stay consistent.
            if i1S > i2S:
                (i1S, i1E), (i2S, i2E) = (i2S, i2E), (i1S, i1E)
            # Insert the boundary markers from the back of the sentence towards the front
            # so that earlier insertion points are not shifted by later insertions.
            tokens.insert(i2E, markers[3])  # </e2>
            tokens.insert(i2S, markers[2])  # <e2>
            tokens.insert(i1E, markers[1])  # </e1>
            tokens.insert(i1S, markers[0])  # <e1>
            sentences[i] = torch.tensor(tokens)
            sub_idx[i] = i1S       # position of the <e1> marker
            obj_idx[i] = i2S + 2   # the two entity-1 markers shift <e2> two positions to the right
        # pad_sequence (torch.nn.utils.rnn.pad_sequence) pads with id 0; those positions
        # are masked out by attention_mask below.
        input_ids = pad_sequence(sentences, batch_first=True)
        relations = self(
            input_ids=input_ids,
            token_type_ids=(input_ids == 0).type(torch.int),
            attention_mask=(input_ids != 0).type(torch.int),
            labels=None,
            sub_idx=sub_idx,
            obj_idx=obj_idx,
        )
        relations = relations.argmax(1)
        # relationTypes (defined elsewhere in the module) maps label indices to relation names.
        return (relations, [relationTypes[rel] for rel in relations])
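
A hypothetical call site for getRelations is sketched below. The model instance, the four boundary-marker special tokens, and the exact span values are assumptions about the surrounding project; only the argument shapes follow from the method above.

import torch
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
# The boundary markers must resolve to real ids, so they are assumed to have been
# added to the vocabulary (here via add_tokens).
tokenizer.add_tokens(["<e1>", "</e1>", "<e2>", "</e2>"])

# One sentence, encoded to subword ids without [CLS]/[SEP].
token_batch = [tokenizer.encode("Alice founded Acme in 1999.", add_special_tokens=False)]
# Two (start, end, extra) triples per sentence, matching the [batch_size, 2, 3] shape
# noted above; getRelations only reads the start and end columns.
span_batch = torch.tensor([[[0, 1, 0], [4, 6, 0]]])

# `model` is assumed to be an instance of the class that defines getRelations.
label_ids, label_names = model.getRelations(tokenizer, token_batch, span_batch)
print(label_ids, label_names)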
Example #3
with open(os.path.join(args.data, 'ontonotes/const/pos/labels.txt')) as f:
  # Read the POS tag inventory and give every tag a zero-initialised length-2 array.
  for line in f:
    pos = line.rstrip()
    if pos == "":
      break
    tag[pos] = np.asarray([0, 0])

    
text_file = open(os.path.join(args.data, 'ontonotes/const/pos/conll-2012-test.json'), 'r')
# Each line is a JSON record with the raw sentence ('text') and its labelled spans ('targets').
for i, line in tqdm(enumerate(text_file)):
  data = json.loads(line)
  tokens = data['text'].split(' ')
  labels = data['targets']
  re_2_o = []       # maps each subword position back to the index of its original word
  retokenized = []  # the sentence re-tokenized into subword ids
  for word_id, token in enumerate(tokens):
    piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(token))
    retokenized.extend(piece_ids)
    re_2_o.extend([word_id for _ in range(len(piece_ids))])
  model_inputs = []
  for span in labels:
    # Collect the subword positions covered by the target span ('span1' is given in word indices).
    span1 = []
    indexed_tokens = retokenized.copy()
    for word_pos in range(span['span1'][0], span['span1'][1]):
      select = np.where(np.asarray(re_2_o) == word_pos)[0]
      span1.extend(select)
    # Mask every subword of the span, then wrap the sentence with [CLS] ... [SEP].
    for idx in span1:
      indexed_tokens[idx] = tokenizer.mask_token_id
    indexed_tokens = [tokenizer.cls_token_id] + indexed_tokens + [tokenizer.sep_token_id]
    model_inputs.append(indexed_tokens)

  retokenized.insert(0, tokenizer.cls_token_id)