def create_features(env, program_tks, max_n_exp):
  """Create features to be used in the score function.

  Args:
    env: QA environment; provides the interpreter namespace, entity
      properties (`ent_props`), column properties (`col_props`), the decoder
      vocab, and optionally a `trigger_words_dict`.
    program_tks: list of program tokens (strings) forming the program.
    max_n_exp: maximum number of expressions allowed in a program; used to
      normalize the expression-count feature.

  Returns:
    A 1-D numpy float array of length 12 when `env.trigger_words_dict` is
    set, otherwise of length 11.
  """
  # Unique decoder variables used in the program (tokens such as 'v0', 'v12').
  variables = set(
      [tk for tk in program_tks if tk[0] == 'v' and tk[1:].isdigit()])
  ns = env.interpreter.namespace
  # Variables already present in the namespace, i.e. entities and column names.
  ns_vars = [tk for tk in variables if tk in ns]
  str_vals = [unicode(ns[tk]['value']) for tk in ns_vars]
  vars_to_val = dict(zip(ns_vars, str_vals))

  # Tokens which represent entities.
  val_ent = env.ent_props['val_ent']
  ent_vars = [tk for tk, val in vars_to_val.iteritems() if val in val_ent]
  ents_index = [val_ent[vars_to_val[tk]][0] for tk in ent_vars]
  entities = [env.entities[i] for i in ents_index]
  sum_entity_length = {
      'datetime_list': 0.0,
      'string_list': 0.0,
      'num_list': 0.0
  }
  ent_lengths = [e['length'] for e in entities]
  for ent, w in zip(entities, ent_lengths):
    sum_entity_length[ent['type']] += w

  # Feature vector.
  feature_size = 12 if env.trigger_words_dict else 11
  f = np.zeros(feature_size)

  # Entity features.
  ent_sum = sum(sum_entity_length.values())
  ques_ent_sum = sum(env.ent_props['sum_entity_length'].values())
  # No of fractional entities where each entity is weighted by its length.
  f[0] = ent_sum / (ques_ent_sum + EPS)
  # No of fractional entities for each entity type weighted by its length.
  # NOTE(review): the order of the three type features depends on dict
  # iteration order; it is stable within a run (fixed key set) but each value
  # is paired with its own denominator, so the ratios themselves are correct.
  f[1:4] = [
      v / (env.ent_props['sum_entity_length'][k] + EPS)
      for k, v in sum_entity_length.iteritems()
  ]
  # No of fractional entities.
  f[4] = len(ent_vars) / (len(env.entities) + EPS)
  # Feature that the longest entity is present in the program or not; if there
  # is more than one entity that has the maximum length, this feature
  # represents the fraction of entities present in the program that have the
  # max length.
  max_len = max(ent_lengths) if ent_lengths else 0.0
  if max_len == env.ent_props['max_length']:
    max_ent_sum = data_utils.max_sum(ent_lengths)
    f[5] = max_ent_sum / (env.ent_props['max_sum'] + EPS)

  # Column features.
  # Only consider columns representing differing things, for example,
  # year-date and year-string are considered the same column.
  column_vars, column_vals = [], set()
  for v in ns_vars:
    if v not in ent_vars:
      str_val = vars_to_val[v].split('-')[0]
      if str_val not in column_vals:
        column_vars.append(v)
        column_vals.add(str_val)
  cols = env.de_vocab.lookup(column_vars)
  col_features = [env.id_feature_dict[col][0] for col in cols]
  col_sum, ques_col_sum = sum(col_features), env.col_props['sum']
  f[6] = col_sum / (ques_col_sum + EPS)
  f[7] = sum([(i > 0) for i in col_features]) / (env.col_props['num'] + EPS)
  max_w = max(col_features) if col_features else 0.0
  if max_w == env.col_props['max']:
    f[8] = data_utils.max_sum(col_features) / (env.col_props['max_sum'] + EPS)

  # (1 - n) where n = fractional number of expressions in the program.
  num_exp = program_tks.count('(')
  # Bug fix: both operands are ints, so under Python 2 classic division the
  # ratio truncated to 0 (feature stuck at 1.0). Cast to float so the true
  # fraction is used; this is a no-op when true division is already in effect.
  f[9] = 1.0 - (float(num_exp) / max_n_exp)

  # Function tokens.
  if env.trigger_words_dict:
    fn_score = 0.0
    fn_tks = [tk for tk in program_tks if tk in env.trigger_words_dict]
    if fn_tks:
      # Fraction of trigger-word functions whose trigger set overlaps the
      # question tokens.
      for tk in fn_tks:
        if env.ques_tokens & env.trigger_words_dict[tk]:
          fn_score += 1.0
      fn_score /= len(fn_tks)
    f[10] = fn_score

  # Approximate set intersection similarity feature.
  denominator = ((2 * ques_ent_sum + ques_col_sum) * num_exp)
  f[-1] = (2 * ent_sum + col_sum) / (denominator + EPS)
  return f
def __init__(self,
             en_vocab,
             de_vocab,
             question_annotation,
             answer,
             constant_value_embedding_fn,
             score_fn,
             interpreter,
             constants=None,
             punish_extra_work=True,
             init_interp=True,
             trigger_words_dict=None,
             max_cache_size=1e4,
             name='qa_programming'):
  """Initialize a QA programming environment.

  Args:
    en_vocab: encoder vocabulary; used to look up the question tokens.
    de_vocab: decoder vocabulary; `len(de_vocab.vocab)` minus the
      interpreter's memory size gives the number of builtin tokens.
    question_annotation: dict describing the question. Must contain
      'tokens', 'entities', 'features' and 'prop_features'; may contain
      'pos_tags'.
    answer: the target answer for this question (stored on the instance).
    constant_value_embedding_fn: function mapping a constant value to its
      embedding.
    score_fn: scoring function (stored on the instance).
    interpreter: program interpreter; provides `namespace`, `max_mem` and
      `max_n_exp`, and receives constants via `add_constant`.
    constants: optional list of dicts with 'value' and 'type' keys to add
      to the interpreter before the question entities.
    punish_extra_work: flag stored on the instance (used elsewhere).
    init_interp: if True, register constants and entities in the
      interpreter's namespace.
    trigger_words_dict: optional dict mapping function tokens to sets of
      trigger words/POS tags.
    max_cache_size: maximum number of elements in the search cache.
    name: name of this environment (also used for logging and the cache).
  """
  self.name = name
  self.en_vocab = en_vocab
  self.de_vocab = de_vocab
  self.end_action = self.de_vocab.end_id
  self.score_fn = score_fn
  self.interpreter = interpreter
  self.answer = answer
  self.question_annotation = question_annotation
  self.constant_value_embedding_fn = constant_value_embedding_fn
  self.constants = constants
  self.punish_extra_work = punish_extra_work
  self.error = False
  self.trigger_words_dict = trigger_words_dict
  tokens = question_annotation['tokens']
  # Question token set; POS tags are mixed in when available so trigger-word
  # matching can hit either surface forms or tags.
  if 'pos_tags' in question_annotation:
    self.ques_tokens = set(tokens + question_annotation['pos_tags'])
  else:
    self.ques_tokens = set(tokens)
  en_inputs = en_vocab.lookup(tokens)
  # Memory layout: builtin tokens, then memory slots; of the memory slots,
  # n_exp are reserved for expressions, the rest hold constants.
  self.n_builtin = len(de_vocab.vocab) - interpreter.max_mem
  self.n_mem = interpreter.max_mem
  self.n_exp = interpreter.max_n_exp
  max_n_constants = self.n_mem - self.n_exp
  constant_spans = []
  constant_values = []
  if constants is None:
    constants = []
  for c in constants:
    # Pre-supplied constants have no position in the question text.
    constant_spans.append([-1, -1])
    constant_values.append(c['value'])
    if init_interp:
      self.interpreter.add_constant(value=c['value'], type=c['type'])
  for entity in question_annotation['entities']:
    # Use encoder output at start and end (inclusive) step
    # to create span embedding.
    constant_spans.append(
        [entity['token_start'], entity['token_end'] - 1])
    constant_values.append(entity['value'])
    if init_interp:
      self.interpreter.add_constant(value=entity['value'],
                                    type=entity['type'])
  constant_value_embeddings = [
      constant_value_embedding_fn(value) for value in constant_values
  ]
  if len(constant_values) > (self.n_mem - self.n_exp):
    tf.logging.info(
        'Not enough memory slots for example {}, which has {} constants.'
        .format(self.name, len(constant_values)))
  # Truncate to what fits in the available constant slots.
  constant_spans = constant_spans[:max_n_constants]
  constant_value_embeddings = constant_value_embeddings[:max_n_constants]
  self.context = (en_inputs, constant_spans, constant_value_embeddings,
                  question_annotation['features'],
                  question_annotation['tokens'])
  # Create output features.
  prop_features = question_annotation['prop_features']
  # Maps each decoder token id to its property feature list; defaults to [0]
  # for tokens without a string value present in prop_features.
  self.id_feature_dict = {}
  for name, id in de_vocab.vocab.iteritems():
    self.id_feature_dict[id] = [0]
    if name in self.interpreter.namespace:
      val = self.interpreter.namespace[name]['value']
      if ((isinstance(val, str) or isinstance(val, unicode)) and
          val in prop_features):
        self.id_feature_dict[id] = prop_features[val]
  # Create features to make calculation of score function easy
  entities = question_annotation['entities']
  for e in entities:
    if e['type'] != 'datetime_list':
      e['length'] = e['token_end'] - e['token_start']
    else:
      # For datetime entities, either token_end or token_start is incorrect,
      # so need to look at the entity itself for calculating the length
      # Also, we shouldn't consider 'xxxx' or 'xx' while calculating the
      # entity length
      e['length'] = len([
          x for x in e['value'][0].replace('x', '').split('-') if x
      ])
  entity_lengths = [e['length'] for e in entities]
  max_entity_length = max(entity_lengths) if entity_lengths else 0.0
  max_entity_sum = data_utils.max_sum(entity_lengths)
  # Total entity length per entity type, used for normalization in the
  # score-function features.
  sum_entity_length = {
      'datetime_list': 0,
      'string_list': 0,
      'num_list': 0
  }
  for e, w in zip(entities, entity_lengths):
    sum_entity_length[e['type']] += w
  self.entities = entities
  self.ent_props = dict(max_sum=max_entity_sum,
                        max_length=max_entity_length,
                        val_ent=value_to_index(entities),
                        sum_entity_length=sum_entity_length)
  col_features = [v[0] for v in self.id_feature_dict.values()]
  self.col_props = dict(sum=sum(col_features),
                        max=max(col_features) if col_features else 0.0,
                        max_sum=data_utils.max_sum(col_features),
                        num=sum([i > 0 for i in col_features]))
  self.cache = SearchCache(name=name, max_elements=max_cache_size)
  self.use_cache = False
  self.reset()