Example #1
def create_features(env, program_tks, max_n_exp):
    """Create features to be used in the score function."""
    # Unique decoder variables used in the program
    variables = set(
        [tk for tk in program_tks if tk[0] == 'v' and tk[1:].isdigit()])
    ns = env.interpreter.namespace
    # Variables already present in the namespace, i.e. entities and column names
    ns_vars = [tk for tk in variables if tk in ns]
    str_vals = [unicode(ns[tk]['value']) for tk in ns_vars]
    vars_to_val = dict(zip(ns_vars, str_vals))

    # Tokens which represent entities
    val_ent = env.ent_props['val_ent']
    ent_vars = [tk for tk, val in vars_to_val.iteritems() if val in val_ent]
    ents_index = [val_ent[vars_to_val[tk]][0] for tk in ent_vars]
    entities = [env.entities[i] for i in ents_index]
    sum_entity_length = {
        'datetime_list': 0.0,
        'string_list': 0.0,
        'num_list': 0.0
    }
    ent_lengths = [e['length'] for e in entities]
    for ent, w in zip(entities, ent_lengths):
        sum_entity_length[ent['type']] += w

    # Feature vector
    feature_size = 12 if env.trigger_words_dict else 11
    f = np.zeros(feature_size)
    # Entity features
    ent_sum = sum(sum_entity_length.values())
    ques_ent_sum = sum(env.ent_props['sum_entity_length'].values())
    # Fraction of entities present in the program, with each entity weighted by its length.
    f[0] = ent_sum / (ques_ent_sum + EPS)
    # Per-type fraction of entities present in the program, weighted by entity length
    f[1:4] = [
        v / (env.ent_props['sum_entity_length'][k] + EPS)
        for k, v in sum_entity_length.iteritems()
    ]
    # Fraction of entities present in the program (unweighted)
    f[4] = len(ent_vars) / (len(env.entities) + EPS)

    # Whether the longest entity appears in the program. If more than one
    # entity has the maximum length, this feature is the fraction of
    # max-length entities that appear in the program.
    max_len = max(ent_lengths) if ent_lengths else 0.0
    if max_len == env.ent_props['max_length']:
        max_ent_sum = data_utils.max_sum(ent_lengths)
        f[5] = max_ent_sum / (env.ent_props['max_sum'] + EPS)

    # Column features
    # Only consider columns representing different things; for example,
    # year-date and year-string are treated as the same column.
    column_vars, column_vals = [], set()
    for v in ns_vars:
        if v not in ent_vars:
            str_val = vars_to_val[v].split('-')[0]
            if str_val not in column_vals:
                column_vars.append(v)
                column_vals.add(str_val)
    cols = env.de_vocab.lookup(column_vars)
    col_features = [env.id_feature_dict[col][0] for col in cols]
    col_sum, ques_col_sum = sum(col_features), env.col_props['sum']
    f[6] = col_sum / (ques_col_sum + EPS)
    f[7] = sum([(i > 0) for i in col_features]) / (env.col_props['num'] + EPS)
    max_w = max(col_features) if col_features else 0.0
    if max_w == env.col_props['max']:
        f[8] = data_utils.max_sum(col_features) / (env.col_props['max_sum'] +
                                                   EPS)

    # (1 - n) where n = fractional number of expressions in the program
    num_exp = program_tks.count('(')
    # float() guards against Python 2 integer division.
    f[9] = 1.0 - (float(num_exp) / max_n_exp)

    # Function tokens
    if env.trigger_words_dict:
        fn_score = 0.0
        fn_tks = [tk for tk in program_tks if tk in env.trigger_words_dict]
        if fn_tks:
            for tk in fn_tks:
                if env.ques_tokens & env.trigger_words_dict[tk]:
                    fn_score += 1.0
            fn_score /= len(fn_tks)
        f[10] = fn_score

    # Approximate set intersection similarity feature
    denominator = ((2 * ques_ent_sum + ques_col_sum) * num_exp)
    f[-1] = (2 * ent_sum + col_sum) / (denominator + EPS)
    return f
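
Note: `data_utils.max_sum` is external to this snippet. Judging only from how `f[5]` and `f[8]` are computed above (a ratio of the program's max-length entities/columns to the question's), it plausibly sums the elements tied for the maximum. A minimal sketch under that assumption, not the actual `data_utils` implementation:

def max_sum(values):
    """Hypothetical reconstruction of data_utils.max_sum, inferred from its
    use in create_features: return the summed weight of all elements tied
    for the maximum, so that max_sum(program_lengths) / max_sum(question_lengths)
    is the fraction of maximum-length entities appearing in the program."""
    if not values:
        return 0.0
    m = max(values)
    return float(sum(v for v in values if v == m))
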
Example #2
    def __init__(self,
                 en_vocab,
                 de_vocab,
                 question_annotation,
                 answer,
                 constant_value_embedding_fn,
                 score_fn,
                 interpreter,
                 constants=None,
                 punish_extra_work=True,
                 init_interp=True,
                 trigger_words_dict=None,
                 max_cache_size=1e4,
                 name='qa_programming'):
        self.name = name
        self.en_vocab = en_vocab
        self.de_vocab = de_vocab
        self.end_action = self.de_vocab.end_id
        self.score_fn = score_fn
        self.interpreter = interpreter
        self.answer = answer
        self.question_annotation = question_annotation
        self.constant_value_embedding_fn = constant_value_embedding_fn
        self.constants = constants
        self.punish_extra_work = punish_extra_work
        self.error = False
        self.trigger_words_dict = trigger_words_dict
        tokens = question_annotation['tokens']
        if 'pos_tags' in question_annotation:
            self.ques_tokens = set(tokens + question_annotation['pos_tags'])
        else:
            self.ques_tokens = set(tokens)

        en_inputs = en_vocab.lookup(tokens)
        self.n_builtin = len(de_vocab.vocab) - interpreter.max_mem
        self.n_mem = interpreter.max_mem
        self.n_exp = interpreter.max_n_exp
        max_n_constants = self.n_mem - self.n_exp

        constant_spans = []
        constant_values = []
        if constants is None:
            constants = []
        for c in constants:
            constant_spans.append([-1, -1])
            constant_values.append(c['value'])
            if init_interp:
                self.interpreter.add_constant(value=c['value'], type=c['type'])

        for entity in question_annotation['entities']:
            # Use encoder output at start and end (inclusive) step
            # to create span embedding.
            constant_spans.append(
                [entity['token_start'], entity['token_end'] - 1])
            constant_values.append(entity['value'])
            if init_interp:
                self.interpreter.add_constant(value=entity['value'],
                                              type=entity['type'])

        constant_value_embeddings = [
            constant_value_embedding_fn(value) for value in constant_values
        ]

        if len(constant_values) > (self.n_mem - self.n_exp):
            tf.logging.info(
                'Not enough memory slots for example {}, which has {} constants.'
                .format(self.name, len(constant_values)))

        constant_spans = constant_spans[:max_n_constants]
        constant_value_embeddings = constant_value_embeddings[:max_n_constants]
        self.context = (en_inputs, constant_spans, constant_value_embeddings,
                        question_annotation['features'],
                        question_annotation['tokens'])

        # Create output features.
        prop_features = question_annotation['prop_features']
        self.id_feature_dict = {}
        for name, id in de_vocab.vocab.iteritems():
            self.id_feature_dict[id] = [0]
            if name in self.interpreter.namespace:
                val = self.interpreter.namespace[name]['value']
                if isinstance(val, (str, unicode)) and val in prop_features:
                    self.id_feature_dict[id] = prop_features[val]

        # Create features to make calculation of score function easy
        entities = question_annotation['entities']
        for e in entities:
            if e['type'] != 'datetime_list':
                e['length'] = e['token_end'] - e['token_start']
            else:
                # For datetime entities, either token_end or token_start is
                # incorrect, so we need to look at the entity value itself to
                # compute the length. Placeholder segments such as 'xxxx' or
                # 'xx' are not counted towards the entity length.
                e['length'] = len([
                    x for x in e['value'][0].replace('x', '').split('-') if x
                ])
        entity_lengths = [e['length'] for e in entities]
        max_entity_length = max(entity_lengths) if entity_lengths else 0.0
        max_entity_sum = data_utils.max_sum(entity_lengths)
        sum_entity_length = {
            'datetime_list': 0,
            'string_list': 0,
            'num_list': 0
        }
        for e, w in zip(entities, entity_lengths):
            sum_entity_length[e['type']] += w

        self.entities = entities
        self.ent_props = dict(max_sum=max_entity_sum,
                              max_length=max_entity_length,
                              val_ent=value_to_index(entities),
                              sum_entity_length=sum_entity_length)
        col_features = [v[0] for v in self.id_feature_dict.values()]
        self.col_props = dict(sum=sum(col_features),
                              max=max(col_features) if col_features else 0.0,
                              max_sum=data_utils.max_sum(col_features),
                              num=sum([i > 0 for i in col_features]))

        self.cache = SearchCache(name=name, max_elements=max_cache_size)
        self.use_cache = False
        self.reset()
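
Note: `value_to_index` is also external to this snippet. Since `create_features` in Example #1 looks up `unicode` values in `ent_props['val_ent']` and uses the result to index into `env.entities`, a plausible sketch (an assumption about the helper, not its actual code) is:

def value_to_index(entities):
    """Hypothetical reconstruction: map the unicode form of each entity value
    to the list of positions at which that value occurs in `entities`."""
    val_ent = {}
    for i, ent in enumerate(entities):
        key = unicode(ent['value'])
        val_ent.setdefault(key, []).append(i)
    return val_ent
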