import nltk

# NOTE: QUOTED_STRING_RE is assumed to be a module-level compiled regex that
# matches a quoted string literal, with group(0) covering the full literal
# (including the quotes).


def process_query(query, code):
    """Canonicalize a natural-language query and its code snippet by replacing
    quoted string literals with placeholder tokens (_STR:0_, _STR:1_, ...)."""
    from parse import code_to_ast, ast_to_tree, tree_to_ast, parse
    import astor

    str_count = 0
    str_map = dict()

    match_count = 1
    match = QUOTED_STRING_RE.search(query)
    while match:
        str_repr = '_STR:%d_' % str_count
        str_literal = match.group(0)
        str_string = match.group(2)  # unquoted contents (currently unused)

        match_count += 1
        # if match_count > 50:
        #     return

        # Replace the first remaining quoted string in the query with its
        # placeholder and remember the mapping.
        query = QUOTED_STRING_RE.sub(str_repr, query, 1)
        str_map[str_literal] = str_repr

        str_count += 1
        match = QUOTED_STRING_RE.search(query)

        # Apply the same substitution to the code snippet.
        code = code.replace(str_literal, '\'' + str_repr + '\'')

    # clean the annotation
    # query = query.replace('.', ' . ')

    # Restore '%s' / "%s" literals: they are format placeholders rather than
    # string values worth abstracting away.
    for k, v in str_map.items():
        if k == '\'%s\'' or k == '\"%s\"':
            query = query.replace(v, k)
            code = code.replace('\'' + v + '\'', k)

    # Tokenize the canonicalized query.
    query_tokens = nltk.word_tokenize(query)

    # Break up dotted names (e.g. os.path) into bracketed sub-token groups,
    # keeping the original token as well.
    new_query_tokens = []
    for token in query_tokens:
        new_query_tokens.append(token)
        i = token.find('.')
        if 0 < i < len(token) - 1:
            new_tokens = ['['] + token.replace('.', ' . ').split(' ') + [']']
            new_query_tokens.extend(new_tokens)

    # Sanity check: the canonicalized code must still parse and regenerate
    # to source without errors.
    tree = parse(code)
    ast_tree = tree_to_ast(tree)
    astor.to_source(ast_tree)

    return new_query_tokens, code, str_map
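

# Hedged usage sketch (not part of the original file): the query/code pair
# below is invented for illustration, and running it requires the real module
# context -- QUOTED_STRING_RE and the project's parse module must be
# importable, and NLTK's tokenizer data must be installed.
if __name__ == '__main__':
    example_query = 'call the function foo with the string "bar" as argument'
    example_code = 'foo("bar")'

    tokens, canonical_code, str_map = process_query(example_query, example_code)

    # tokens: query tokens with the quoted literal replaced by a placeholder
    # canonical_code: code with the same literal replaced by its placeholder
    # str_map: mapping from the original literal to its placeholder token
    print(tokens)
    print(canonical_code)
    print(str_map)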