def test_package_name_like_string(self):
    reference = TokenizeQuery()('foo.bar')
    assert [
        Token(None, "foo.bar", "foo.bar"),
        Token(None, "foo", "foo"),
        Token(None, "bar", "bar")
    ] == reference
def test_eval(self): entries = [Environment( {"ground_truth": "y = x + 1"}, set(["ground_truth"]) )] dataset = ListDataset(entries) d = get_samples(dataset, MockParser()) aencoder = ActionSequenceEncoder(d, 0) action_sequence = GroundTruthToActionSequence(MockParser())( "y = x + 1" ) transform = AddPreviousActions(aencoder) prev_action_tensor = transform( reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")], action_sequence=action_sequence, train=False ) assert np.array_equal( [ [2, -1, -1], [3, -1, -1], [4, -1, -1], [-1, 1, -1], [1, -1, -1], [5, -1, -1], [-1, 2, -1], [1, -1, -1], [4, -1, -1], [-1, 3, -1], [1, -1, -1], [6, -1, -1], [-1, 4, -1], [1, -1, -1] ], prev_action_tensor.numpy() )
def test_eval(self): entries = [Environment( {"text_query": "foo bar", "ground_truth": "y = x + 1"}, set(["ground_truth"]) )] dataset = ListDataset(entries) d = get_samples(dataset, MockParser()) aencoder = ActionSequenceEncoder(d, 0) action_sequence = GroundTruthToActionSequence(MockParser())( "y = x + 1" ) transform = AddActions(aencoder) action_tensor = transform( reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")], action_sequence=action_sequence, train=False ) assert np.array_equal( [ [2, 2, 0], [4, 3, 1], [6, 4, 2], [6, 4, 2], [5, 3, 1], [6, 5, 5], [6, 5, 5], [5, 5, 5], [6, 4, 8], [6, 4, 8], [5, 5, 5], [9, 6, 11], [9, 6, 11], [-1, -1, -1] ], action_tensor.numpy() )
def test_decode(self):
    funcdef = ExpandTreeRule(
        NodeType("def", NodeConstraint.Node, False),
        [("name", NodeType("value", NodeConstraint.Token, True)),
         ("body", NodeType("expr", NodeConstraint.Node, True))])
    expr = ExpandTreeRule(
        NodeType("expr", NodeConstraint.Node, False),
        [("op", NodeType("value", NodeConstraint.Token, True)),
         ("arg0", NodeType("value", NodeConstraint.Token, True)),
         ("arg1", NodeType("value", NodeConstraint.Token, True))])

    encoder = ActionSequenceEncoder(
        Samples([funcdef, expr],
                [
                    NodeType("def", NodeConstraint.Node, False),
                    NodeType("value", NodeConstraint.Token, True),
                    NodeType("expr", NodeConstraint.Node, False)
                ],
                [("", "f")]),
        0)
    action_sequence = ActionSequence()
    action_sequence.eval(ApplyRule(funcdef))
    action_sequence.eval(GenerateToken("", "f"))
    action_sequence.eval(GenerateToken("", "1"))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

    expected_action_sequence = ActionSequence()
    expected_action_sequence.eval(ApplyRule(funcdef))
    expected_action_sequence.eval(GenerateToken("", "f"))
    expected_action_sequence.eval(GenerateToken("", "1"))
    expected_action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

    result = encoder.decode(
        encoder.encode_action(
            action_sequence, [Token(None, "1", "1")])[:-1, 1:],
        [Token(None, "1", "1")])
    assert \
        expected_action_sequence.action_sequence == result.action_sequence
def test_subtokens(self):
    reference = TokenizeQuery()('foo.bar')
    assert [Token(None, "SUB_START", ""),
            Token(None, "foo", "foo"),
            Token(None, ".", "."),
            Token(None, "bar", "bar"),
            Token(None, "SUB_END", "")] == reference
def test_encode_action(self):
    funcdef = ExpandTreeRule(
        NodeType("def", NodeConstraint.Node, False),
        [("name", NodeType("value", NodeConstraint.Token, True)),
         ("body", NodeType("expr", NodeConstraint.Node, True))])
    expr = ExpandTreeRule(
        NodeType("expr", NodeConstraint.Node, False),
        [("op", NodeType("value", NodeConstraint.Token, True)),
         ("arg0", NodeType("value", NodeConstraint.Token, True)),
         ("arg1", NodeType("value", NodeConstraint.Token, True))])

    encoder = ActionSequenceEncoder(
        Samples([funcdef, expr],
                [
                    NodeType("def", NodeConstraint.Node, False),
                    NodeType("value", NodeConstraint.Token, True),
                    NodeType("expr", NodeConstraint.Node, True)
                ],
                [("", "f"), ("", "2")]),
        0)
    action_sequence = ActionSequence()
    action_sequence.eval(ApplyRule(funcdef))
    action_sequence.eval(GenerateToken("", "f"))
    action_sequence.eval(GenerateToken("", "1"))
    action_sequence.eval(GenerateToken("", "2"))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

    action = encoder.encode_action(
        action_sequence,
        [Token("", "1", "1"), Token("", "2", "2")])

    assert np.array_equal([[-1, 2, -1, -1],
                           [2, -1, 1, -1],
                           [2, -1, -1, 0],
                           [2, -1, 2, 1],
                           [2, 1, -1, -1],
                           [3, -1, -1, -1]],
                          action.numpy())
def test_happy_path(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("int a = 0;") == [
        (0, Token("INT", "int", "int")),
        (4, Token("ID", "a", "a")),
        (6, Token("EQUALS", "=", "=")),
        (8, Token("INT_CONST_OCT", "0", "0")),
        (9, Token("SEMI", ";", ";"))
    ]
def test_simple_case(self):
    entries = [Environment(
        {"ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        ground_truth="y = x + 1"
    )
    transform = EncodeActionSequence(aencoder)
    ground_truth = transform(
        action_sequence=action_sequence,
        reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
    )
    assert np.array_equal(
        [
            [3, -1, -1], [4, -1, -1], [-1, 1, -1], [1, -1, -1],
            [5, -1, -1], [-1, 2, -1], [1, -1, -1], [4, -1, -1],
            [-1, 3, -1], [1, -1, -1], [6, -1, -1], [-1, 4, -1],
            [1, -1, -1]
        ],
        ground_truth.numpy()
    )
def unparse(self, code: AST) -> Optional[diffAST]:
    assert isinstance(code, Node)
    fields = {field.name: field.value for field in code.fields}
    if code.get_type_name() == "Diff":
        # Recursively unparse each delta; fail if any of them fails
        deltas = [
            self.unparse(delta)
            for delta in cast(List[AST], fields["deltas"])
        ]
        if None in deltas:
            return None
        return Diff(cast(List[Delta], deltas))
    elif code.get_type_name() == "Insert":
        # Reassemble the inserted line from its leaf tokens
        value = self.lexer.untokenize([
            Token(None, cast(Leaf, v).value, cast(Leaf, v).value)
            for v in cast(List[AST], fields["value"])
        ])
        if value is None:
            return None
        return Insert(cast(Leaf, fields["line_number"]).value, value)
    elif code.get_type_name() == "Remove":
        return Remove(cast(Leaf, fields["line_number"]).value)
    elif code.get_type_name() == "Replace":
        value = self.lexer.untokenize([
            Token(None, cast(Leaf, v).value, cast(Leaf, v).value)
            for v in cast(List[AST], fields["value"])
        ])
        if value is None:
            return None
        return Replace(cast(Leaf, fields["line_number"]).value, value)
    raise AssertionError(f"invalid node type_name: {code.get_type_name()}")
def test_eval(self): entries = [Environment( {"text_query": "ab test", "ground_truth": "y = x + 1"}, set(["ground_truth"]) )] dataset = ListDataset(entries) d = get_samples(dataset, MockParserWithoutVariadicArgs()) aencoder = ActionSequenceEncoder(d, 0) action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())( "y = x + 1" ) transform = AddQueryForTreeGenDecoder(aencoder, 3,) query = transform( reference=[Token(None, "ab", "ab"), Token(None, "test", "test")], action_sequence=action_sequence, train=False ) assert np.array_equal( [ [-1, -1, -1], [2, -1, -1], [3, 2, -1], [4, 3, 2], [3, 2, -1], [5, 3, 2], [5, 3, 2], [4, 5, 3], [5, 3, 2], [6, 5, 3] ], query.numpy() )
def test_eval(self): entries = [Environment( {"text_query": "ab test", "ground_truth": "y = x + 1"}, set(["ground_truth"]) )] dataset = ListDataset(entries) d = get_samples(dataset, MockParserWithoutVariadicArgs()) aencoder = ActionSequenceEncoder(d, 0) action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())( "y = x + 1" ) transform = AddActionSequenceAsTree(aencoder,) matrix, depth = transform( reference=[Token(None, "ab", "ab"), Token(None, "test", "test")], action_sequence=action_sequence, train=False ) assert np.array_equal( [0, 1, 2, 3, 2, 3, 3, 4, 3, 4], depth.numpy() ) assert np.array_equal( [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], matrix.numpy() )
def test_n_dependent(self):
    entries = [Environment(
        {"text_query": "ab test", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParserWithoutVariadicArgs())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(
        MockParserWithoutVariadicArgs()
    )("y = x + 1")
    transform = AddPreviousActionRules(aencoder, 2, n_dependent=3)
    prev_rule_action = transform(
        reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
        action_sequence=action_sequence,
        train=False,
    )
    assert np.array_equal(
        [
            # str -> "y"
            [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
            # Number -> number
            [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
        ],
        prev_rule_action.numpy()
    )
def test_invalid_program(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("int a = 0") == [
        (0, Token("INT", "int", "int")),
        (4, Token("ID", "a", "a")),
        (6, Token("EQUALS", "=", "=")),
        (8, Token("INT_CONST_OCT", "0", "0")),
    ]
def test_int_placeholder(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("int a = 1;") == [
        (0, Token("name", "___name@0___", "int")),
        (4, Token("name", "___name@1___", "a")),
        (6, Token("op", "=", "=")),
        (8, Token("number", "___number@0___", "1")),
        (9, Token("op", ";", ";"))
    ]
def test_float_placeholder(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("float a = 1.0;") == [
        (0, Token("name", "___name@0___", "float")),
        (6, Token("name", "___name@1___", "a")),
        (8, Token("op", "=", "=")),
        (10, Token("number", "___number@0___", "1.0")),
        (13, Token("op", ";", ";"))
    ]
def test_encode():
    encoder = EncodeQuery(128)
    out = encoder(Environment(
        states={"reference": [Token(None, "print", "print"),
                              Token(None, "___var@1___", "`x`")]}
    ))
    assert out.states["input_ids"].shape == (128, 50)
    assert out.states["segment_ids"].shape == (128,)
    assert out.states["input_mask"].shape == (128,)
    assert np.all(out.states["segment_ids"].numpy() == 0)
    assert np.all(out.states["input_mask"][:4].numpy() == 1)
    assert np.all(out.states["input_mask"][4:].numpy() == 0)
def test_quoted_string(self):
    reference = TokenizeQuery()('"quoted string" test')
    assert [
        Token(None, "####0####", "quoted string"),
        Token(None, "test", "test")
    ] == reference

    reference = TokenizeQuery()('"quoted string" "quoted string" test')
    assert [
        Token(None, "####0####", "quoted string"),
        Token(None, "####0####", "quoted string"),
        Token(None, "test", "test")
    ] == reference
def test_extractor():
    encoder = EncodeQuery(128)
    extractor = Extractor()
    out = encoder(Environment(
        states={"reference": [Token(None, "print", "print"),
                              Token(None, "___var@1___", "`x`")]}
    ))
    out.states["input_ids"] = out.states["input_ids"].unsqueeze(0)
    out.states["segment_ids"] = out.states["segment_ids"].unsqueeze(0)
    out.states["input_mask"] = out.states["input_mask"].unsqueeze(0)
    out = extractor(out)
    assert out.states["reference_features"].data.shape == (128, 1, 768)
    assert not out.states["reference_features"].data.requires_grad
    assert np.all(out.states["reference_features"].mask[:4].numpy() == 1)
    assert np.all(out.states["reference_features"].mask[4:].numpy() == 0)
def test_encode_invalid_sequence(self):
    funcdef = ExpandTreeRule(
        NodeType("def", NodeConstraint.Node, False),
        [("name", NodeType("value", NodeConstraint.Token, True)),
         ("body", NodeType("expr", NodeConstraint.Node, True))])
    expr = ExpandTreeRule(
        NodeType("expr", NodeConstraint.Node, False),
        [("op", NodeType("value", NodeConstraint.Token, False)),
         ("arg0", NodeType("value", NodeConstraint.Token, True)),
         ("arg1", NodeType("value", NodeConstraint.Token, True))])

    encoder = ActionSequenceEncoder(
        Samples([funcdef, expr],
                [
                    NodeType("def", NodeConstraint.Node, False),
                    NodeType("value", NodeConstraint.Token, True),
                    NodeType("expr", NodeConstraint.Node, True)
                ],
                [("", "f")]),
        0)
    action_sequence = ActionSequence()
    action_sequence.eval(ApplyRule(funcdef))
    action_sequence.eval(GenerateToken("", "f"))
    action_sequence.eval(GenerateToken("", "1"))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

    assert encoder.encode_action(
        action_sequence, [Token("", "2", "2")]) is None
def test_encode_empty_sequence(self):
    funcdef = ExpandTreeRule(
        NodeType("def", NodeConstraint.Node, False),
        [("name", NodeType("value", NodeConstraint.Token, False)),
         ("body", NodeType("expr", NodeConstraint.Node, True))])
    expr = ExpandTreeRule(
        NodeType("expr", NodeConstraint.Node, False),
        [("op", NodeType("value", NodeConstraint.Token, False)),
         ("arg0", NodeType("value", NodeConstraint.Token, False)),
         ("arg1", NodeType("value", NodeConstraint.Token, False))])

    encoder = ActionSequenceEncoder(
        Samples([funcdef, expr],
                [
                    NodeType("def", NodeConstraint.Node, False),
                    NodeType("value", NodeConstraint.Token, False),
                    NodeType("expr", NodeConstraint.Node, False)
                ],
                [("", "f")]),
        0)
    action_sequence = ActionSequence()

    action = encoder.encode_action(action_sequence, [Token("", "1", "1")])
    parent = encoder.encode_parent(action_sequence)
    d, m = encoder.encode_tree(action_sequence)

    assert np.array_equal([[-1, -1, -1, -1]], action.numpy())
    assert np.array_equal([[-1, -1, -1, -1]], parent.numpy())
    assert np.array_equal(np.zeros((0,)), d.numpy())
    assert np.array_equal(np.zeros((0, 0)), m.numpy())
def test_ast_set_sample(self):
    asts = ["c0", "c1", "c2"]
    sampler = SequentialProgramSampler(
        MockSynthesizer(asts),
        transform_input,
        Collate(),
        MockEncoder(),
        MockExpander(),
        MockInterpreter())
    zero = SamplerState(0, sampler.initialize([(None, None)]))
    samples = list(sampler.batch_k_samples([zero], [3]))
    samples.sort(key=lambda x: -x.state.score)
    assert 3 == len(samples)
    assert samples[0] == DuplicatedSamplerState(
        SamplerState(
            1,
            Environment({
                "test_cases": [(None, None)],
                "reference": [Token(None, str(asts[0]), str(asts[0]))],
                "variables": [["#" + str(asts[0])]],
                "interpreter_state": BatchedState(
                    {str(asts[0]): None},
                    {str(asts[0]): ["#" + str(asts[0])]},
                    [str(asts[0])],
                    ["#" + str(asts[0])])
            })),
        1)
    assert DuplicatedSamplerState(
        SamplerState(
            0.5,
            Environment({
                "test_cases": [(None, None)],
                "reference": [Token(None, str(asts[1]), str(asts[1]))],
                "variables": [["#" + str(asts[1])]],
                "interpreter_state": BatchedState(
                    {str(asts[1]): None},
                    {str(asts[1]): ["#" + str(asts[1])]},
                    [str(asts[1])],
                    ["#" + str(asts[1])])
            })),
        1) == samples[1]
    assert DuplicatedSamplerState(
        SamplerState(
            1.0 / 3,
            Environment({
                "test_cases": [(None, None)],
                "reference": [Token(None, str(asts[2]), str(asts[2]))],
                "variables": [["#" + str(asts[2])]],
                "interpreter_state": BatchedState(
                    {str(asts[2]): None},
                    {str(asts[2]): ["#" + str(asts[2])]},
                    [str(asts[2])],
                    ["#" + str(asts[2])])
            })),
        1) == samples[2]
def test_encode_each_action(self):
    funcdef = ExpandTreeRule(
        NodeType("def", NodeConstraint.Node, False),
        [("name", NodeType("value", NodeConstraint.Token, True)),
         ("body", NodeType("expr", NodeConstraint.Node, True))])
    expr = ExpandTreeRule(
        NodeType("expr", NodeConstraint.Node, False),
        [("constant", NodeType("value", NodeConstraint.Token, True))])

    encoder = ActionSequenceEncoder(
        Samples([funcdef, expr],
                [
                    NodeType("def", NodeConstraint.Node, False),
                    NodeType("value", NodeConstraint.Token, True),
                    NodeType("expr", NodeConstraint.Node, False),
                    NodeType("expr", NodeConstraint.Node, True)
                ],
                [("", "f"), ("", "2")]),
        0)
    action_sequence = ActionSequence()
    action_sequence.eval(ApplyRule(funcdef))
    action_sequence.eval(GenerateToken("", "f"))
    action_sequence.eval(GenerateToken("", "1"))
    action_sequence.eval(GenerateToken("", "2"))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
    action_sequence.eval(ApplyRule(expr))
    action_sequence.eval(GenerateToken("", "f"))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
    action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

    action = encoder.encode_each_action(
        action_sequence,
        [Token("", "1", "1"), Token("", "2", "2")],
        1)

    assert np.array_equal(
        np.array(
            [
                [[1, -1, -1], [2, -1, -1]],    # funcdef
                [[-1, -1, -1], [-1, 1, -1]],   # f
                [[-1, -1, -1], [-1, -1, 0]],   # 1
                [[-1, -1, -1], [-1, 2, 1]],    # 2
                [[-1, -1, -1], [-1, -1, -1]],  # CloseVariadicField
                [[3, -1, -1], [2, -1, -1]],    # expr
                [[-1, -1, -1], [-1, 1, -1]],   # f
                [[-1, -1, -1], [-1, -1, -1]],  # CloseVariadicField
                [[-1, -1, -1], [-1, -1, -1]]   # CloseVariadicField
            ],
            dtype=np.int64),  # np.long was removed in NumPy 1.24; int64 is its alias
        action.numpy())
def test_impossible_case(self):
    entries = [Environment(
        {"ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    d.tokens = [("", "y"), ("", "1")]
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        ground_truth="y = x + 1"
    )
    transform = EncodeActionSequence(aencoder)
    with pytest.raises(RuntimeError):
        transform(
            reference=[Token(None, "foo", "foo"),
                       Token(None, "bar", "bar")],
            action_sequence=action_sequence,
        )
def tokenize_with_offset(self, value):
    # Treat a lone newline as untokenizable
    if value == "\n":
        return None

    # Split on spaces and newlines, recording each token's character offset
    retval = []
    offset = 0
    while True:
        next_sp = value.find(" ", offset)
        next_nl = value.find("\n", offset)
        if next_sp == -1 and next_nl == -1:
            # No more separators: the rest of the string is the last token
            retval.append(
                (offset, Token(None, value[offset:], value[offset:])))
            break
        next_sp = next_sp if next_sp >= 0 else len(value) + 1
        next_nl = next_nl if next_nl >= 0 else len(value) + 1
        next = min(next_sp, next_nl)
        v = value[offset:next]
        retval.append((offset, Token(None, v, v)))
        offset = next + 1
    return retval
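# A minimal usage sketch (not from the original sources) for the
# whitespace-based tokenizer above. It assumes the method belongs to a class
# exposed here as `WhitespaceLexer` (a hypothetical name) and that `Token` is
# the three-field token type used throughout these tests.
def test_whitespace_lexer_sketch():
    lexer = WhitespaceLexer()  # hypothetical class defining the method above
    # Each word becomes one (offset, Token) pair; spaces and newlines both
    # act as separators.
    assert lexer.tokenize_with_offset("foo bar\nbaz") == [
        (0, Token(None, "foo", "foo")),
        (4, Token(None, "bar", "bar")),
        (8, Token(None, "baz", "baz")),
    ]
    # A lone newline is treated as untokenizable
    assert lexer.tokenize_with_offset("\n") is None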
def test_multiple_lines(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("int a = 0\nint b;") == [
        (0, Token("INT", "int", "int")),
        (4, Token("ID", "a", "a")),
        (6, Token("EQUALS", "=", "=")),
        (8, Token("INT_CONST_OCT", "0", "0")),
        (10, Token("INT", "int", "int")),
        (14, Token("ID", "b", "b")),
        (15, Token("SEMI", ";", ";"))
    ]
def __call__(self, env):
    # Encode the reference for CharacterBERT
    env.states["reference"] = \
        [Token(None, "[CLS]", "[CLS]")] + env.states["reference"] + \
        [Token(None, "[SEP]", "[SEP]")]
    tokens = [token.raw_value for token in env.states["reference"]]
    segment_ids = [0] * len(tokens)
    input_ids = self.indexer.as_padded_tensor(
        [tokens], maxlen=self.max_query_length)[0]
    input_mask = [1] * len(tokens)

    # Zero-pad the mask and segment ids up to the maximum query length
    padding_length = self.max_query_length - len(input_mask)
    input_mask += [0] * padding_length
    segment_ids += [0] * padding_length

    env.states["input_ids"] = input_ids
    env.states["input_mask"] = torch.tensor(input_mask)
    env.states["segment_ids"] = torch.tensor(segment_ids)
    return env
def test_eval(self): entries = [Environment( {"text_query": "ab test", "ground_truth": "y = x + 1"}, set(["ground_truth"]) )] dataset = ListDataset(entries) d = get_samples(dataset, MockParserWithoutVariadicArgs()) aencoder = ActionSequenceEncoder(d, 0) action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())( "y = x + 1" ) transform = AddPreviousActionRules(aencoder, 2) prev_rule_action = transform( reference=[Token(None, "ab", "ab"), Token(None, "test", "test")], action_sequence=action_sequence, train=False ) assert np.array_equal( [ # None -> Root [[1, -1, -1], [2, -1, -1], [-1, -1, -1]], # Assign -> Name, expr [[3, -1, -1], [4, -1, -1], [5, -1, -1]], # Name -> str [[4, -1, -1], [6, -1, -1], [-1, -1, -1]], # str -> "x" [[-1, -1, -1], [-1, 1, -1], [-1, -1, -1]], # Op -> str, expr, expr [[7, -1, -1], [6, -1, -1], [5, -1, -1]], # str -> "+" [[-1, -1, -1], [-1, 2, -1], [-1, -1, -1]], # Name -> str [[4, -1, -1], [6, -1, -1], [-1, -1, -1]], # str -> "y" [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]], # Number -> number [[8, -1, -1], [9, -1, -1], [-1, -1, -1]], [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]], ], prev_rule_action.numpy() )
def tokenize_with_offset(self, code: str) \
        -> Optional[List[Tuple[int, Token[str, str]]]]:
    code = code.replace("\r", "")
    lexer = CLexer(logger.warning,
                   lambda: None, lambda: None,
                   lambda x: False)
    lexer.build(optimize=False)
    lexer.input(code)
    tokens: List[LexToken] = list(iter(lexer.token, None))

    return [(token.lexpos, Token(token.type, token.value, token.value))
            for token in tokens]
def test_id_placeholder(self):
    lexer = Lexer()
    assert lexer.tokenize_with_offset("int a;a;b;") == [
        (0, Token("name", "___name@0___", "int")),
        (4, Token("name", "___name@1___", "a")),
        (5, Token("op", ";", ";")),
        (6, Token("name", "___name@1___", "a")),
        (7, Token("op", ";", ";")),
        (8, Token("name", "___name@2___", "b")),
        (9, Token("op", ";", ";"))
    ]
def __call__(self, query: str) -> List[Token]:
    """
    Tokenize a query

    Parameters
    ----------
    query: str

    Returns
    -------
    List[Token]
    """
    # Preprocess annotation
    def placeholder(id: int) -> str:
        return f"####{id}####"

    # Replace quoted string literals with placeholders
    mappings: Dict[str, str] = {}
    word_to_placeholder: Dict[str, str] = {}
    literal = r'\'\\\'\'|\"[^\"]+\"|\'[^\']+\'|`[^`]+`|"""[^"]+"""'
    while True:
        m = re.search(literal, query)
        if m is None:
            break

        w = m.group(0)[1:len(m.group(0)) - 1]
        if str(w) in word_to_placeholder:
            p = word_to_placeholder[str(w)]
        else:
            p = placeholder(len(mappings))
        query = query[:m.start()] + p + query[m.end():]

        assert "####" not in w
        mappings[p] = str(w)
        word_to_placeholder[str(w)] = p

    reference = []
    for word in tokenizer.tokenize(query):
        if word in mappings:
            # Placeholder token keeps the original literal as its raw value
            reference.append(Token[str, str](None, word, mappings[word]))
        else:
            reference.append(Token[str, str](None, word, word))
            # Split by '.' and append the sub-words as extra tokens
            vars = list(filter(lambda x: len(x) > 0, word.split('.')))
            if len(vars) > 1:
                for v in vars:
                    reference.append(Token(None, v, v))
    return reference
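# A minimal usage sketch (not from the original sources) combining the two
# behaviours of TokenizeQuery.__call__ above: quoted literals are replaced by
# ####<id>#### placeholders that keep the raw value, and words containing '.'
# additionally emit their sub-words. It assumes the underlying `tokenizer`
# keeps "foo.bar" and the placeholder as single words, as the tests above
# indicate.
def test_tokenize_query_sketch():
    reference = TokenizeQuery()('foo.bar "quoted string"')
    assert reference == [
        Token(None, "foo.bar", "foo.bar"),
        Token(None, "foo", "foo"),
        Token(None, "bar", "bar"),
        Token(None, "####0####", "quoted string"),
    ]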