def get_matched_entries(s, field_values, m_theta=0.85, s_theta=0.85):
    if not field_values:
        return None

    if isinstance(s, str):
        n_grams = split(s)
    else:
        n_grams = s

    matched = dict()
    for field_value in field_values:
        if not isinstance(field_value, string_types):
            continue
        fv_tokens = split(field_value)
        sm = difflib.SequenceMatcher(None, n_grams, fv_tokens)
        match = sm.find_longest_match(0, len(n_grams), 0, len(fv_tokens))
        if match.size > 0:
            source_match = get_effecitve_match_source(n_grams, match.a, match.a + match.size)
            if source_match and source_match.size > 1:
                match_str = field_value[match.b:match.b + match.size]
                source_match_str = s[source_match.start:source_match.start + source_match.size]
                c_match_str = match_str.lower().strip()
                c_source_match_str = source_match_str.lower().strip()
                c_field_value = field_value.lower().strip()
                if c_match_str and not utils.is_number(c_match_str) and not utils.is_common_db_term(c_match_str):
                    if utils.is_stopword(c_match_str) or utils.is_stopword(c_source_match_str) or \
                            utils.is_stopword(c_field_value):
                        continue
                    if c_source_match_str.endswith(c_match_str + '\'s'):
                        match_score = 1.0
                    else:
                        if prefix_match(c_field_value, c_source_match_str):
                            match_score = fuzz.ratio(c_field_value, c_source_match_str) / 100
                        else:
                            match_score = 0
                    if (utils.is_commonword(c_match_str) or utils.is_commonword(c_source_match_str) or
                            utils.is_commonword(c_field_value)) and match_score < 1:
                        continue
                    s_match_score = match_score
                    if match_score >= m_theta and s_match_score >= s_theta:
                        if field_value.isupper() and match_score * s_match_score < 1:
                            continue
                        matched[match_str] = (field_value, source_match_str, match_score, s_match_score, match.size)

    if not matched:
        return None
    else:
        return sorted(matched.items(),
                      key=lambda x: (1e16 * x[1][2] + 1e8 * x[1][3] + x[1][4]),
                      reverse=True)

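# A small, self-contained illustration of the longest-match primitive used above.
# SequenceMatcher is run over the question n-grams and the tokenized field value;
# the toy token lists below are made up purely for illustration.
import difflib

n_grams = ['what', 'is', 'the', 'capital', 'of', 'france']
fv_tokens = ['capital', 'of', 'france']
sm = difflib.SequenceMatcher(None, n_grams, fv_tokens)
match = sm.find_longest_match(0, len(n_grams), 0, len(fv_tokens))
assert (match.a, match.b, match.size) == (3, 0, 3)  # question tokens 3..5 match the whole field value
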
def is_numeric_field(self, s):
    assert isinstance(s, string_types)
    if is_number(s):
        return False
    assert re.fullmatch(utils.field_pattern, s)
    field_id = self.schema.get_field_id(s)
    field_node = self.schema.get_field(field_id)
    return field_node.is_numeric

def MaxAmount(amount):
    if isinstance(amount, str) and is_number(amount):
        _amount = safe_int(Number(amount) * 1.0001)
        return str(_amount)
    if isinstance(amount, dict) and utils.isValidAmount(amount):
        _value = Number(amount['value']) * 1.0001
        amount['value'] = str(_value)
        return amount
    # NOTE: the exception object is returned (not raised), so callers must check the
    # result type themselves.
    return Exception('invalid amount to max')

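# What the 1.0001 multiplier above does in practice: it adds a ~0.01% buffer to the
# amount before conversion back to a string. Standalone arithmetic only; safe_int and
# utils.isValidAmount are not reproduced here, plain int() truncation is used instead.
assert int(float('100') * 1.0001) == 100          # small amounts are unchanged after truncation
assert int(float('1000000') * 1.0001) == 1000100  # larger amounts grow by roughly 0.01%
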
def is_field(self, s):
    if not isinstance(s, string_types):
        return False
    if is_number(s):
        return False
    if re.fullmatch(utils.field_pattern, s):
        table_name, field_name = s.split('.')
        if re.fullmatch(utils.alias_pattern, table_name):
            table_name = self.get_table_name_by_alias(table_name)
        return self.schema.is_table_name(table_name) and self.schema.is_field_name(field_name)
    else:
        return self.schema.is_field_name(s)

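# A minimal, standalone illustration of the "table.field" check above. The real
# utils.field_pattern and utils.alias_pattern are not shown in this section, so the
# regex below is a hypothetical stand-in used only for this example.
import re

field_pattern = r'[A-Za-z_]\w*\.[A-Za-z_]\w*'       # stand-in for utils.field_pattern
assert re.fullmatch(field_pattern, 'singer.name')
assert re.fullmatch(field_pattern, 'name') is None  # bare names fall through to is_field_name(s)
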
def Number(num):
    if not is_number(num):
        return float('nan')
    if isinstance(num, bool) and num:
        return 1
    if isinstance(num, bool) and not num:
        return 0
    if isinstance(num, (int, float)):
        return num
    if '.' in num:
        return float(num)
    else:
        return int(num)

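# A usage sketch for Number, assuming the is_number helper referenced above accepts
# numeric strings as numbers. Note that the bool branches must precede the int/float
# branch: bool is a subclass of int in Python, so isinstance(True, (int, float)) is
# True and would otherwise shadow the explicit 1/0 mapping.
import math

assert isinstance(True, int)             # why the explicit bool checks come first
assert Number('3') == 3                  # integer-looking strings become int
assert Number('3.5') == 3.5              # strings containing '.' become float
assert math.isnan(Number('not a number'))
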
def extract_value_spans(program_tokens, program_token_types, tu):
    values = []
    value, is_value = [], False
    for t, t_type in zip(program_tokens, program_token_types):
        if t_type == sql_tokenizer.VALUE:
            value.append(t)
        else:
            if value:
                value_str = tu.tokenizer.convert_tokens_to_string(value)
                value_str = value_str.replace(' . ', '.')
                value_str = value_str.replace(' @ ', '@')
                value_str = value_str.replace(' - ', '-')
                if not utils.is_number(value_str):
                    values.append(value_str)
                value = []
    return values

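# A standalone illustration of the span cleanup above. The real code uses the
# tokenizer's convert_tokens_to_string; here the tokens are simply joined on spaces
# to show what the replace calls undo.
tokens = ['john', '@', 'example', '.', 'com']
value_str = ' '.join(tokens)                                  # 'john @ example . com'
value_str = value_str.replace(' . ', '.').replace(' @ ', '@').replace(' - ', '-')
assert value_str == 'john@example.com'
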
def dispatch(self, json, is_table=False, should_quote=should_quote):
    if isinstance(json, list):
        return self.delimited_list(json)

    if isinstance(json, dict):
        if len(json) == 0:
            return [], []
        elif 'value' in json:
            return self.value(json)
        elif 'from' in json:
            # Nested query 'from'
            return add_parentheses(self.tokenize(json))
        elif 'query' in json:
            # Nested query 'query'
            nested_query_tokens = self.tokenize(json['query'])
            if 'name' in json:
                return connect_by_keywords(
                    add_parentheses(nested_query_tokens),
                    self.dispatch(json['name'], is_table=True),
                    ['AS'])
            else:
                return add_parentheses(nested_query_tokens)
        elif 'union' in json:
            # Nested query 'union'
            return add_parentheses(self.union(json['union']))
        elif 'intersect' in json:
            return add_parentheses(self.intersect(json['intersect']))
        elif 'except' in json:
            return add_parentheses(self.except_(json['except']))
        else:
            return self.op(json)

    if not isinstance(json, string_types):
        json = text(json)
    if is_table and json.lower() == 't0':
        return self.value_tokenize(json), [RESERVED_TOKEN, RESERVED_TOKEN]
    if self.keep_singleton_fields and (is_table or self.is_field(json) or json == '*'):
        if is_table:
            return [json], [TABLE]
        else:
            return [json], [FIELD]
    if self.atomic_value:
        self.constants.append(escape(json, self.value_tokenize, self.ansi_quotes, never))
        if is_number(json):
            return [self.num_token], [VALUE]
        else:
            return [self.str_token], [VALUE]
    else:
        return escape(json, self.value_tokenize, self.ansi_quotes, should_quote)

def func(self, op, json):
    # NOTE: the original snippet referenced `op` without defining it; it is assumed
    # here to be passed in by the traverser that dispatches operator nodes.
    if op in ['<>', '>', '<', '>=', '<=', '=', '!='] and \
            isinstance(json[0], string_types) and \
            (isinstance(json[1], string_types) or
             (isinstance(json[1], dict) and 'literal' in json[1])):
        assert (len(json) == 2 and isinstance(json, list))
        v1, v2 = json
        if isinstance(v2, dict):
            v2 = v2['literal']
        if is_number(v2):
            return
        if v1 != v2:
            if self.is_field(v1) and not self.is_field(v2):
                v1_id = self.schema.get_field_id(v1)
                v1 = self.schema.get_field_signature(v1_id)
                self.values.append((v1, v2))
    else:
        for v in json:
            self.dispatch(v)

def _literal(self, json):
    if isinstance(json, list):
        return add_parentheses(
            (functools.reduce(lambda x, y: x + y, [self._literal(v)[0] for v in json]),
             functools.reduce(lambda x, y: x + y, [self._literal(v)[1] for v in json])))
    elif isinstance(json, string_types):
        if self.atomic_value:
            self.constants.append(escape(json, self.value_tokenize, self.ansi_quotes, never))
            if is_number(json):
                return [self.num_token], [VALUE]
            else:
                return [self.str_token], [VALUE]
        else:
            return escape(json, self.value_tokenize, self.ansi_quotes, always)
    else:
        tokens = self.value_tokenize(text(json))
        token_types = [VALUE for _ in tokens]
        return tokens, token_types

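# The functools.reduce calls above flatten the per-element (tokens, token_types)
# pairs of a literal list into two flat sequences; a standalone illustration of the
# list-concatenation reduce:
import functools

parts = [['a'], ['b', 'c'], ['d']]
assert functools.reduce(lambda x, y: x + y, parts) == ['a', 'b', 'c', 'd']
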
def preprocess_example(split, example, args, parsed_programs, text_tokenize, program_tokenize,
                       post_process, table_utils, schema_graph, vocabs, verbose=False):
    tu = table_utils
    text_vocab = vocabs['text']
    program_vocab = vocabs['program']

    def get_memory_values(features, raw_text, args):
        if args.pretrained_transformer.startswith('bert-') and \
                args.pretrained_transformer.endswith('-uncased'):
            return utils.restore_feature_case(features, raw_text, tu)
        else:
            return features

    def get_text_schema_adjacency_matrix(text_features, s_M):
        schema_size = s_M.shape[0]
        text_size = len(text_features)
        full_size = schema_size + text_size
        M = ssp.lil_matrix((full_size, full_size), dtype=np.int)
        M[-schema_size:, -schema_size:] = s_M
        return M

    # sanity check
    ############################
    query_oov = False
    denormalized = False
    schema_truncated = False
    token_restored = True
    ############################

    # Text feature extraction and set program ground truth list
    if isinstance(example, Text2SQLExample):
        if args.pretrained_transformer:
            text_features = text_tokenize(example.text)
            text_tokens, token_starts, token_ends = get_memory_values(text_features, example.text, args)
            if not token_starts:
                token_restored = False
        else:
            text_tokens = text_tokenize(example.text, functional_tokens)
            text_features = [t.lower() for t in text_tokens]
        example.text_tokens = text_features
        example.text_ptr_values = text_tokens
        example.text_token_starts = token_starts
        example.text_token_ends = token_ends
        example.text_ids = vec.vectorize(text_features, text_vocab)
        example.text_ptr_input_ids = vec.vectorize(text_features, text_vocab)
        program_list = example.program_list
        example.values = [(schema_graph.get_field(cond[0]).signature, cond[2])
                          for cond in example.program_ast_list_[0]['conds']
                          if (isinstance(cond[2], str) and not is_number(cond[2]))]
    else:
        text_tokens = example.example.text_ptr_values
        text_features = example.example.text_tokens
        program_list = example.example.program_list

    # Schema feature extraction
    if args.model_id in [BRIDGE]:
        question_encoding = example.text if args.use_picklist else None
        tables = sorted([schema_graph.get_table_id(t_name) for t_name in example.gt_table_names]) \
            if args.use_oracle_tables else None
        table_po, field_po = schema_graph.get_schema_perceived_order(tables)
        schema_features, matched_values = schema_graph.get_serialization(
            tu, flatten_features=True, table_po=table_po, field_po=field_po,
            use_typed_field_markers=args.use_typed_field_markers,
            use_graph_encoding=args.use_graph_encoding,
            question_encoding=question_encoding,
            top_k_matches=args.top_k_picklist_matches,
            num_values_per_field=args.num_values_per_field,
            no_anchor_text=args.no_anchor_text)
        example.matched_values = matched_values
        example.input_tokens, example.input_ptr_values, num_excluded_tables, num_excluded_fields = \
            get_table_aware_transformer_encoder_inputs(text_tokens, text_features, schema_features, table_utils)
        schema_truncated = (num_excluded_fields > 0)
        num_included_nodes = schema_graph.get_num_perceived_nodes(table_po) + 1 \
            - num_excluded_tables - num_excluded_fields
        example.ptr_input_ids = vec.vectorize(example.input_tokens, text_vocab)
        if args.read_picklist:
            example.transformer_output_value_mask, value_features, value_tokens = \
                get_transformer_output_value_mask(example.input_tokens, matched_values, tu)
        example.primary_key_ids = schema_graph.get_primary_key_ids(
            num_included_nodes, table_po=table_po, field_po=field_po)
        example.foreign_key_ids = schema_graph.get_foreign_key_ids(
            num_included_nodes, table_po=table_po, field_po=field_po)
        example.field_type_ids = schema_graph.get_field_type_ids(
            num_included_nodes, table_po=table_po, field_po=field_po)
        example.table_masks = schema_graph.get_table_masks(
            num_included_nodes, table_po=table_po, field_po=field_po)
        example.field_table_pos = schema_graph.get_field_table_pos(
            num_included_nodes, table_po=table_po, field_po=field_po)
        example.schema_M = schema_graph.adj_matrix
        example.M = get_text_schema_adjacency_matrix(text_features, example.schema_M)
    else:
        num_included_nodes = schema_graph.num_nodes

    # Value copy feature extraction
    if args.read_picklist:
        constant_memory_features = text_features + value_features
        constant_memory = text_tokens + value_tokens
        example.text_ptr_values = constant_memory
    else:
        constant_memory_features = text_features
    constant_ptr_value_ids, constant_unique_input_ids = vec.vectorize_ptr_in(
        constant_memory_features, program_vocab)
    if isinstance(example, Text2SQLExample):
        example.text_ptr_value_ids = constant_ptr_value_ids
    example.ptr_value_ids = constant_ptr_value_ids + [
        program_vocab.size + len(constant_memory_features) + x
        for x in range(num_included_nodes)]

    if not args.leaderboard_submission:
        for j, program in enumerate(program_list):
            if isinstance(example, Text2SQLExample):
                # Model II. Bridge output
                program_singleton_field_tokens, program_singleton_field_token_types = \
                    tok.wikisql_struct_to_tokens(example.program_ast_, schema_graph, tu)
                program_singleton_field_tokens = [START_TOKEN] + program_singleton_field_tokens + [EOS_TOKEN]
                program_singleton_field_token_types = \
                    [RESERVED_TOKEN_TYPE] + program_singleton_field_token_types + [RESERVED_TOKEN_TYPE]
                example.program_singleton_field_tokens_list.append(program_singleton_field_tokens)
                example.program_singleton_field_token_types_list.append(program_singleton_field_token_types)
                program_singleton_field_input_ids = vec.vectorize_singleton(
                    program_singleton_field_tokens, program_singleton_field_token_types, program_vocab)
                example.program_singleton_field_input_ids_list.append(program_singleton_field_input_ids)
            else:
                # Model II. Bridge output
                example.program_singleton_field_input_ids_list.append(
                    example.example.program_singleton_field_input_ids_list[j])
                program_singleton_field_tokens = example.example.program_singleton_field_tokens_list[j]
                program_singleton_field_token_types = example.example.program_singleton_field_token_types_list[j]

            program_field_ptr_value_ids = vec.vectorize_field_ptr_out(
                program_singleton_field_tokens, program_singleton_field_token_types,
                program_vocab, constant_unique_input_ids,
                max_memory_size=len(constant_memory_features),
                schema=schema_graph,
                num_included_nodes=num_included_nodes)
            example.program_text_and_field_ptr_value_ids_list.append(program_field_ptr_value_ids)
            table_ids = [schema_graph.get_table_id(table_name)
                         for table_name in example.gt_table_names_list[j]]
            example.table_ids_list.append(table_ids)
            assert ([schema_graph.get_table(x).name for x in table_ids] == example.gt_table_names)

            # sanity check
            ############################
            # NL+Schema pointer output contains tokens that do not belong to any of the following categories
            if verbose:
                if program_vocab.unk_id in program_field_ptr_value_ids:
                    unk_indices = [i for i, x in enumerate(program_field_ptr_value_ids)
                                   if x == program_vocab.unk_id]
                    print('OOV II: {}'.format(' '.join(
                        [program_singleton_field_tokens[i] for i in unk_indices])))
                    example.pretty_print(
                        schema=schema_graph,
                        de_vectorize_ptr=vec.de_vectorize_ptr,
                        de_vectorize_field_ptr=vec.de_vectorize_field_ptr,
                        rev_vocab=program_vocab,
                        post_process=post_process,
                        use_table_aware_te=(args.model_id in [BRIDGE]))
                    query_oov = True
                if program_vocab.unk_field_id in program_field_ptr_value_ids:
                    example.pretty_print(
                        schema=schema_graph,
                        de_vectorize_ptr=vec.de_vectorize_ptr,
                        de_vectorize_field_ptr=vec.de_vectorize_field_ptr,
                        rev_vocab=program_vocab,
                        post_process=post_process,
                        use_table_aware_te=(args.model_id in [BRIDGE]))
                if program_vocab.unk_table_id in program_field_ptr_value_ids:
                    example.pretty_print(
                        schema=schema_graph,
                        de_vectorize_ptr=vec.de_vectorize_ptr,
                        de_vectorize_field_ptr=vec.de_vectorize_field_ptr,
                        rev_vocab=program_vocab,
                        post_process=post_process,
                        use_table_aware_te=(args.model_id in [BRIDGE]))
            ############################

            # Store the ground truth queries after preprocessing to run a relaxed evaluation or
            # to evaluate with partial queries
            if split == 'dev':
                input_tokens = text_tokens
                if args.model_id in [BRIDGE]:
                    _p = vec.de_vectorize_field_ptr(
                        program_field_ptr_value_ids, program_vocab, input_tokens,
                        schema=schema_graph, post_process=post_process)
                else:
                    _p = program
                example.gt_program_list.append(_p)

            # sanity check
            ############################
            # try:
            #     assert(equal_ignoring_trivial_diffs(_p, program.lower(), verbose=True))
            # except Exception:
            #     print('_p:\t\t{}'.format(_p))
            #     print('program:\t{}'.format(program))
            #     print()
            #     import pdb
            #     pdb.set_trace()
            ############################

    example.run_unit_tests()

    return query_oov, denormalized, schema_truncated, token_restored

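# A minimal sketch of what get_text_schema_adjacency_matrix does: the schema
# adjacency matrix s_M is embedded into the bottom-right block of a larger
# (text tokens + schema nodes) matrix. The sizes below are made up for illustration.
import numpy as np
import scipy.sparse as ssp

s_M = ssp.lil_matrix(np.ones((3, 3), dtype=int))     # hypothetical 3-node schema graph
text_size = 4                                        # hypothetical 4 text tokens
full_size = text_size + s_M.shape[0]
M = ssp.lil_matrix((full_size, full_size), dtype=int)
M[-s_M.shape[0]:, -s_M.shape[0]:] = s_M
assert M[text_size, text_size] == 1                  # schema block is filled in
assert M[0, 0] == 0                                  # text block is left empty
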