    def create_token_mask(self, s):
        if len(s) == 0:
            # print('token set mask is empty')
            # info('token set mask is empty')
            return generate_mask([(0, self.vocab.vocabulary_size-1)], size=self.vocab.vocabulary_size)
        # mask = create_token_mask_by_token_set(s, vocab_len=self.vocab.vocabulary_size)
        mask = generate_mask(s, size=self.vocab.vocabulary_size)
        return mask
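A hedged usage sketch of the method above, assuming a hypothetical instance `transform` whose `vocab.vocabulary_size` is 10, and assuming `generate_mask` sets single indices and inclusive (start, end) ranges to 1:

# Hypothetical illustration only; vocabulary_size assumed to be 10.
full_mask = transform.create_token_mask(set())          # empty set falls back to the full range, presumably [1] * 10
subset_mask = transform.create_token_mask({2, (5, 7)})  # presumably 1 at indices 2, 5, 6, 7 and 0 elsewhere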
Example #2
    def __call__(self, sample):
        """
        :param sample: a list of nodes
        :return: a dict of lists {'to_parse_token', 'terminal_mask'}
        """
        production_vocabulary = self._production_vocabulary

        get_token_id = production_vocabulary.get_token_id
        get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
        vocabulary_size = production_vocabulary.token_num()
        generate_mask_fn = generate_mask(size=vocabulary_size)
        get_node_right_id = lambda x: x.right_id

        stack = [get_token_id(production_vocabulary.EMPTY), sample[0].left_id]
        to_parse_token_id = [sample[0].left_id]

        for node in sample:
            type_id = stack.pop()
            if isinstance(node, LeafToken):
                # print("Terminal token:{}".format(node.value))
                to_parse_token_id.append(stack[-1])
            else:
                assert type_id == node.left_id
                for right_id in reversed(get_node_right_id(node)):
                    stack.append(right_id)

        terminal_mask = [
            generate_mask_fn(get_matched_terminal_index(token))
            for token in to_parse_token_id
        ]
        return {
            "to_parse_token": to_parse_token_id,
            "terminal_mask": terminal_mask
        }
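The call `generate_mask(size=vocabulary_size)` with only the size keyword returns a function that is later applied to an index collection, which suggests `generate_mask` is curried. A minimal sketch of that call pattern, assuming toolz-style currying and a hypothetical stand-in mask function:

from toolz import curry

@curry
def generate_mask_sketch(indexes, size):
    # Stand-in with assumed semantics: 1 at every listed index, 0 elsewhere.
    mask = [0] * size
    for i in indexes:
        mask[i] = 1
    return mask

mask_fn = generate_mask_sketch(size=5)  # only size is bound; indexes still missing
print(mask_fn([0, 3]))                  # [1, 0, 0, 1, 0]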
    def _generate_terminal_mask(
        self,
        terminal_label_index,
    ):
        # Build a 0/1 mask over the keyword vocabulary from the grammar's terminal
        # labels, and record whether an identifier may appear at this position.
        size = self._keyword_num
        token_index_set = set()
        keyword_map = pre_defined_c_tokens_map
        has_identifier = 0
        for t in terminal_label_index:
            token_str = self._label_vocabulary.get_label_by_id(t)
            if token_str == "ID" or token_str == "TYPEID":
                has_identifier = 1
            elif token_str == "IMAGINARY_":
                pass
            elif token_str == "END_OF_SLK_INPUT":
                token_index_set.add(self._token_vocabulary.word_to_id(END))
            elif token_str == "CONSTANT" or token_str == "STRING_LITERAL":
                token_index_set.add(
                    self._token_vocabulary.word_to_id(token_str))
            else:
                token_index_set.add(
                    self._token_vocabulary.word_to_id(keyword_map[token_str]))
        return generate_mask(token_index_set, size).flip(), 1 - has_identifier
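A small illustration of the return value above, under the assumption that `generate_mask(token_index_set, size)` yields a 0/1 mask over the first `_keyword_num` token ids and that `.flip()` inverts the bits (so a 1 in the flipped mask marks a keyword that is not allowed, which matches how the mask is checked in a later example on this page):

# Hypothetical numbers: 4 keyword tokens, of which ids 0 and 2 are allowed next.
allowed = [1, 0, 1, 0]              # what generate_mask({0, 2}, 4) presumably returns
flipped = [1 - b for b in allowed]  # stand-in for .flip(): [0, 1, 0, 1], 1 marks a disallowed keyword
has_identifier = 1                  # "ID" or "TYPEID" was among the terminal labels
result = (flipped, 1 - has_identifier)  # second element is 0 when an identifier may appear next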
Example #4
    def __init__(self,
                 vocab_size,
                 type_num,
                 embedding_dim,
                 hidden_state_size,
                 rnn_num_layers,
                 identifier_index,
                 keyword_index,
                 terminal_token_index,
                 batch_size):
        super().__init__()
        self._batch_size = batch_size
        self._rnn_num_layers = rnn_num_layers
        self._hidden_state_size = hidden_state_size

        self.token_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.type_embedding = nn.Embedding(type_num, embedding_dim, sparse=True).cuda(GPU_INDEX)
        self.rnn = nn.LSTM(input_size=embedding_dim,
                           hidden_size=hidden_state_size,
                           num_layers=rnn_num_layers,).cuda(GPU_INDEX)

        self.token_prob_mlp = nn.Sequential(
            nn.Linear(hidden_state_size+embedding_dim, hidden_state_size),
            nn.ReLU(),
            nn.Linear(hidden_state_size, type_num)).cuda(GPU_INDEX)

        # self.type_feature_mlp = nn.Sequential(
        #     nn.Linear(embedding_dim, hidden_state_size),
        #     nn.ReLU(),
        #     nn.Linear(hidden_state_size, vocab_size)
        # ).cuda(GPU_INDEX)

        self._initial_state = self.initial_state()
        # self._all_type_index = torch.range(0, type_num-1).type(torch.LongTensor).cuda(GPU_INDEX)
        self._identifier_index = torch.LongTensor([identifier_index]).cuda(GPU_INDEX)
        self._terminal_token_without_identifier_index = torch.LongTensor(
            sorted(list(set(range(type_num)) - {identifier_index}))).cuda(GPU_INDEX)
        self._keyword_index = torch.LongTensor(keyword_index).cuda(GPU_INDEX)
        self._identifier_token_mask = torch.FloatTensor(
            generate_mask(set(range(vocab_size)) - set(keyword_index), vocab_size)).cuda(GPU_INDEX)
        self.stable_log_fn = create_stable_log_fn(1e-7)
        self._terminal_token_index = torch.LongTensor(
            sorted(list(set(terminal_token_index)-{identifier_index}))).cuda(GPU_INDEX)

        self.rnn_feature_mlp = nn.Sequential(
            nn.Linear(hidden_state_size, hidden_state_size),
            nn.ReLU(),
            nn.Linear(hidden_state_size, vocab_size - len(self._terminal_token_index)),
        ).cuda(GPU_INDEX)
Example #5
    def __call__(self, sample):
        """
        :param sample: a list of nodes
        :return: a dict of lists {'to_parse_token', 'terminal_mask'}
        """
        # print()
        production_vocabulary = self._production_vocabulary

        get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
        vocabulary_size = production_vocabulary.token_num()
        generate_mask_fn = generate_mask(size=vocabulary_size)
        get_node_right_id = lambda x: x.right_id

        stack = [production_vocabulary.EMPTY_id, sample[0].left_id]
        string_stack = ["EMPTY", sample[0].left]
        to_parse_token_id = [sample[0].left_id]
        now_id = 0
        peeked_max_id = -1

        sample = list(filter(lambda x: not(isinstance(x, LeafToken) and not production_vocabulary.is_token(x.type_id)),
                             sample))

        tokens = []
        for node in sample:
            if isinstance(node, LeafToken) and production_vocabulary.is_token(node.type_id):
                tokens.append(node.type_id)

        peeked_compact_dict = {}

        for node in sample:
            # print(node)
            type_id = stack.pop()
            type_string = string_stack.pop()
            # print("The stack popped token is:{}, string:{}".format(type_id, type_string))
            if isinstance(node, LeafToken):
                # print("Terminal token:{}".format(node.value))
                if production_vocabulary.is_token(node.type_id):
                    now_id += 1
                    to_parse_token_id.append(stack[-1])
            else:
                assert type_id == node.left_id, "type string is {}, now left is {}".format(type_string, node.left)
                if now_id < len(tokens) and production_vocabulary.need_peek(type_id, tokens[now_id]):
                    # print("need peek")
                    level = 1
                    entry = production_vocabulary.get_parse_entry(type_id, tokens[now_id])
                    peeked_id = now_id + level
                    if peeked_id not in peeked_compact_dict:
                        peeked_compact_dict[peeked_id] = production_vocabulary.get_conflict_matched_terminal_node(entry)
                    while production_vocabulary.need_peek(entry, tokens[peeked_id], True):
                        entry = production_vocabulary.get_conflict_entry(entry, tokens[peeked_id])
                        peeked_id += 1
                        if peeked_id not in peeked_compact_dict:
                            peeked_compact_dict[peeked_id] = production_vocabulary.get_conflict_matched_terminal_node(
                                entry)
                    peeked_max_id = max(peeked_max_id, peeked_id)

                for i, right_id in reversed(list(enumerate(get_node_right_id(node)))):
                    if production_vocabulary.is_token(right_id):
                        stack.append(right_id)
                        string_stack.append(node.right[i])
                    else:
                        # print("{} with id {} is not a token".format(node.right[i], right_id))
                        pass

        terminal_mask = []
        for i, token in enumerate(to_parse_token_id):
            if i in peeked_compact_dict:
                # print("peek", peeked_compact_dict[i])
                terminal_mask.append(generate_mask_fn(peeked_compact_dict[i]))
            else:
                # print("terminal", get_matched_terminal_index(token))
                terminal_mask.append(generate_mask_fn(get_matched_terminal_index(token)))

        return {"to_parse_token": to_parse_token_id, "terminal_mask": terminal_mask,}
Example #6
    def setUp(self):
        self._mask_list = generate_mask([1, (4, 8), 9], 10)
        self._list = [0, 1, 0, 0, 1, 1, 1, 1, 1, 1]
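The fixture above pins down the apparent semantics of `generate_mask`: single indices and inclusive (start, end) ranges are set to 1 in a list of the given size. A minimal re-implementation sketch that reproduces the expected value (the real function may differ, e.g. it also appears to support currying and to return an object with a flip() method):

def generate_mask_sketch(indexes, size):
    # 1 at every listed index; (start, end) tuples are treated as inclusive ranges.
    mask = [0] * size
    for index in indexes:
        if isinstance(index, tuple):
            start, end = index
            for i in range(start, end + 1):
                mask[i] = 1
        else:
            mask[index] = 1
    return mask

assert generate_mask_sketch([1, (4, 8), 9], 10) == [0, 1, 0, 0, 1, 1, 1, 1, 1, 1]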
    def __call__(self, sample):
        return [generate_mask([t], self._size) for t in sample]
    def __call__(self, sample):
        """
        :param sample: a dict {"tree": a list of nodes}
        :return: a dict of lists {'terminal_mask', 'target', 'has_identifier'}
        """
        # print()
        # consistent_identifier = sample["consistent_identifier"]
        # consistent_typename = sample["consistent_typename"]
        target = sample['target']
        target_string = [self._token_vocabulary.id_to_word(t) for t in target]
        sample = sample["tree"]

        production_vocabulary = self._production_vocabulary

        get_matched_terminal_index = production_vocabulary.get_matched_terminal_node
        vocabulary_size = production_vocabulary.token_num()
        generate_mask_fn = generate_mask(size=vocabulary_size)
        get_node_right_id = lambda x: x.right_id

        stack = [production_vocabulary.EMPTY_id, sample[0].left_id]
        string_stack = ["EMPTY", sample[0].left]
        to_parse_token_id = [sample[0].left_id]
        now_id = 0
        peeked_max_id = -1

        sample = list(
            filter(
                lambda x: not (isinstance(x, LeafToken) and
                               not production_vocabulary.is_token(x.type_id)),
                sample))

        tokens = []
        for node in sample:
            if isinstance(node, LeafToken) and production_vocabulary.is_token(
                    node.type_id):
                tokens.append(node.type_id)

        peeked_compact_dict = {}

        for node in sample:
            # print(node)
            type_id = stack.pop()
            type_string = string_stack.pop()
            # print("The stack popped token is:{}, string:{}".format(type_id, type_string))
            if isinstance(node, LeafToken):
                # print("Terminal token:{}".format(node.value))
                if production_vocabulary.is_token(node.type_id):
                    now_id += 1
                    to_parse_token_id.append(stack[-1])
            else:
                assert type_id == node.left_id, "type string is {}, now left is {}".format(
                    type_string, node.left)
                # print(node.left)
                if now_id < len(tokens) and production_vocabulary.need_peek(
                        type_id, tokens[now_id]):
                    # print("need peek")
                    level = 1
                    entry = production_vocabulary.get_parse_entry(
                        type_id, tokens[now_id])
                    # print("entry is:{}".format(entry))
                    peeked_id = now_id + level
                    if peeked_id not in peeked_compact_dict:
                        # print("token {} need peek after token {} saw".format(target_string[peeked_id],  target_string[now_id]))
                        peeked_compact_dict[
                            peeked_id] = production_vocabulary.get_conflict_matched_terminal_node(
                                entry)
                        # print("token {} in peeked_dict? {}".format(target_string[peeked_id], tokens[peeked_id] in peeked_compact_dict[peeked_id]))
                    while production_vocabulary.need_peek(
                            entry, tokens[peeked_id], True):
                        entry = production_vocabulary.get_conflict_entry(
                            entry, tokens[peeked_id])
                        # print("now entry:{}".format(entry))
                        peeked_id += 1
                        # print("token {} need peek after token {} saw".format(target_string[peeked_id],  target_string[now_id]))
                        if peeked_id not in peeked_compact_dict:
                            peeked_compact_dict[
                                peeked_id] = production_vocabulary.get_conflict_matched_terminal_node(
                                    entry)
                    peeked_max_id = max(peeked_max_id, peeked_id)

                for i, right_id in reversed(
                        list(enumerate(get_node_right_id(node)))):
                    if production_vocabulary.is_token(right_id):
                        stack.append(right_id)
                        string_stack.append(node.right[i])
                    else:
                        # print("{} with id {} is not a token".format(node.right[i], right_id))
                        pass

        terminal_mask_index = []
        for (i, token), t in zip(enumerate(to_parse_token_id), target):
            if i in peeked_compact_dict:
                # print("peek", peeked_compact_dict[i])
                # print("target {} use peek".format(self._token_vocabulary.id_to_word(t)))
                terminal_mask_index.append(peeked_compact_dict[i])
            else:
                # print("terminal", get_matched_terminal_index(token))
                # print("target {} use get matched".format(self._token_vocabulary.id_to_word(t)))
                terminal_mask_index.append(get_matched_terminal_index(token))

        terminal_mask = [
            self._generate_terminal_mask(index)
            for index in terminal_mask_index
        ]
        from toolz.sandbox import unzip
        terminal_mask, has_identifier = unzip(terminal_mask)
        terminal_mask = list(terminal_mask)
        for t in terminal_mask:
            assert len(t) == self._keyword_num
        has_identifier = list(has_identifier)

        prev_tokens = []
        for t, mask, index, h_i in zip(target, terminal_mask,
                                       terminal_mask_index, has_identifier):
            if t < self._keyword_num:
                if mask[t] != 0:
                    # print("The code before: {}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in prev_tokens])))
                    # print("all code:{}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in target])))
                    msg = "target {} not in the mask".format(
                        self._token_vocabulary.id_to_word(t))
                    raise ValueError(msg)
            elif h_i != 0:
                # print("The code before: {}".format(
                #     " ".join([self._token_vocabulary.id_to_word(to) for to in prev_tokens])))
                # print("all code:{}".format(" ".join([self._token_vocabulary.id_to_word(to) for to in target])))
                msg = "target {} not in the mask".format(
                    self._token_vocabulary.id_to_word(t))
                raise ValueError(msg)
            else:
                prev_tokens.append(t)

        return {
            "terminal_mask": terminal_mask,
            "target": target,
            "has_identifier": has_identifier
        }
    def _g_map(self, sample: int):
        if sample > self._size:
            raise ValueError(
                "The range mask out of range, with size {} and sample {}".
                format(self._size, sample))
        return generate_mask(range(sample), self._size)
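Under the same assumed semantics, `_g_map` builds a prefix mask: the first `sample` positions are 1 and the rest are 0. A self-contained stand-in with a hypothetical `_size` of 5:

def g_map_sketch(sample, size=5):
    # Stand-in for _g_map, assuming generate_mask(range(sample), size)
    # marks indices 0..sample-1 with 1 and leaves the rest 0.
    if sample > size:
        raise ValueError(
            "The range mask out of range, with size {} and sample {}".format(size, sample))
    return [1 if i < sample else 0 for i in range(size)]

print(g_map_sketch(3))  # [1, 1, 1, 0, 0]
print(g_map_sketch(0))  # [0, 0, 0, 0, 0]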