Example No. 1
    def aggregate(self, predicted_begins, predicted_ends, contexts_text,
                  q_ids):
        """Convert predicted answer spans to text, keyed by question id."""
        batch_size = predicted_begins.shape[0]
        for i in range(batch_size):
            # Slice the predicted span out of the i-th context, turn each
            # character-code vector back into a token, and join the tokens.
            predicted_answer = detokenize(
                map(vec2str,
                    contexts_text[i][predicted_begins[i]:predicted_ends[i]]))
            q_id = vec2str(q_ids[i])
            self.predictions[q_id] = predicted_answer
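
A minimal sketch of the span-to-text conversion this method performs. The vec2str and detokenize stand-ins below only mirror what the project's utilities appear to do (decode character codes, drop NUL padding, join tokens) and are assumptions, not the project code:

import numpy

def vec2str(vector):
    # Stand-in: decode character codes and drop the zero padding.
    return ''.join(chr(c) for c in vector if c)

def detokenize(tokens):
    # Stand-in: the project's detokenizer handles punctuation and spacing;
    # a plain join is enough for this sketch.
    return ' '.join(tokens)

def encode(word, width=8):
    # Zero-padded vector of character codes, the encoding vec2str reverses.
    return numpy.array(map(ord, word) + [0] * (width - len(word)))

context = [encode(w) for w in ['the', 'cat', 'sat', 'down', '.']]
begin, end = 1, 3  # a predicted answer span
print(detokenize(map(vec2str, context[begin:end])))  # -> cat sat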
Example No. 2
    def retrieve(self, batch):
        """Retrieve all definitions for a batch of word sequences.

        TODO: definitions of phrases, phrasal verbs, etc.

        Returns
        -------
        definitions
            A list of word definitions; each definition is a list of
            definition-vocabulary word ids (with optional BOD/EOD markers).
        def_map
            A list of triples (batch_index, time_step, def_index). Maps
            words to their respective definitions from `definitions`.

        """
        definitions = []
        def_map = []
        word_def_indices = {}

        for seq_pos, sequence in enumerate(batch):
            for word_pos, word in enumerate(sequence):
                if isinstance(word, numpy.ndarray):
                    word = vec2str(word)
                if not word:
                    continue
                self._debug_info['N_words'] += 1
                word_id = self._vocab_text.word_to_id(word)
                # Vocabulary ids are frequency-ordered, so ids below
                # _exclude_top_k belong to the most frequent words.
                if (self._exclude_top_k and word_id != self._vocab_text.unk
                        and word_id < self._exclude_top_k):
                    self._debug_info['N_excluded_words'] += 1
                    continue

                if word not in word_def_indices:
                    word_def_indices[word] = []
                    # Fetch definitions only the first time the word is
                    # encountered in this batch.
                    word_defs = self._dictionary.get_definitions(word)

                    if self._max_def_per_word < len(word_defs):
                        if self._with_too_many_defs == 'random':
                            word_defs = self._rng.choice(
                                word_defs,
                                self._max_def_per_word,
                                replace=False)
                        else:
                            # (rizar): if there are too many definitions for a
                            # word, maybe let's just accept that it's a
                            # "semantic prime"?
                            word_defs = []

                    # Debug info
                    self._debug_info['N_distinct_words'] += 1
                    self._debug_info['N_missed_distinct_words'] += (
                        len(word_defs) == 0)
                    # End of debug info

                    for i, def_ in enumerate(word_defs):
                        self._debug_info['N_def'] += 1

                        if self._with_too_long_defs == 'drop':
                            if len(def_) > self._max_def_length:
                                self._debug_info['N_dropped_def'] += 1
                                continue
                        elif self._with_too_long_defs == 'crop':
                            def_ = def_[0:self._max_def_length]
                        else:
                            raise NotImplementedError()

                        final_def_ = []
                        if self._add_bod_eod:
                            final_def_.append(self._vocab_def.bod)
                        for token in def_:
                            final_def_.append(
                                self._vocab_def.word_to_id(token))
                        if self._add_bod_eod:
                            final_def_.append(self._vocab_def.eod)
                        word_def_indices[word].append(len(definitions))
                        definitions.append(final_def_)

                # Debug info
                self._debug_info['N_queried_words'] += 1
                if len(word_def_indices[word]) == 0:
                    self._debug_info['N_missed_words'] += 1
                    # Keep a bounded sample of missed words: append until the
                    # sample holds 10000 words, then overwrite a random slot.
                    if len(self._debug_info['missed_word_sample']) == 10000:
                        self._debug_info['missed_word_sample'][
                            numpy.random.randint(10000)] = word
                    else:
                        self._debug_info['missed_word_sample'].append(word)
                # End of debug info

                for def_index in word_def_indices[word]:
                    def_map.append((seq_pos, word_pos, def_index))

        return definitions, def_map
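
To make the def_map contract concrete, here is a hypothetical output for the batch [['cat', 'sat']], where 'cat' has two definitions and 'sat' has one. All ids below are made up; in the real output each definition is a list of definition-vocabulary ids, optionally wrapped in BOD/EOD markers:

definitions = [
    [2, 15, 301, 908, 3],   # def 0 for 'cat' (2 = bod, 3 = eod, made up)
    [2, 15, 512, 77, 3],    # def 1 for 'cat'
    [2, 640, 9, 88, 3],     # def 0 for 'sat'
]
def_map = [
    (0, 0, 0),  # batch 0, position 0 ('cat') -> definitions[0]
    (0, 0, 1),  # batch 0, position 0 ('cat') -> definitions[1]
    (0, 1, 2),  # batch 0, position 1 ('sat') -> definitions[2]
]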
Example No. 3
def test_vec2str():
    # vec2str should decode the character codes and strip the zero padding.
    vector = map(ord, 'abc') + [0, 0]
    assert vec2str(vector) == 'abc'
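
A minimal vec2str that satisfies this test; a sketch, not necessarily the project's implementation:

def vec2str(vector):
    # Decode character codes and drop the NUL padding.
    return ''.join(chr(c) for c in vector if c != 0)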
Example No. 4
    def perform(self, node, inputs, output_storage):
        # Look up the corpus frequency of every word in an array of
        # zero-padded character-code vectors, preserving the leading shape.
        words = inputs[0]
        words_flat = words.reshape(-1, words.shape[-1])
        word_counts = numpy.array(
            [self._vocab.word_freq(vec2str(word)) for word in words_flat])
        output_storage[0][0] = word_counts.reshape(words.shape[:-1])
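
perform is the numeric half of a custom Theano Op: Theano hands it numpy arrays in inputs and expects the result written into output_storage[0][0]. A hedged skeleton of the surrounding Op; the class name and output dtype are assumptions, not the project's code, and vec2str is assumed imported from the project's utilities:

import numpy
import theano
from theano.gof import Op, Apply

class WordFreqOp(Op):
    # Hypothetical wrapper for the perform() above.
    def __init__(self, vocab):
        self._vocab = vocab

    def make_node(self, words):
        words = theano.tensor.as_tensor_variable(words)
        # The output drops the trailing character axis of the input.
        out_type = theano.tensor.TensorType(
            'int64', (False,) * (words.ndim - 1))
        return Apply(self, [words], [out_type()])

    def perform(self, node, inputs, output_storage):
        words = inputs[0]
        words_flat = words.reshape(-1, words.shape[-1])
        counts = numpy.array(
            [self._vocab.word_freq(vec2str(w)) for w in words_flat])
        output_storage[0][0] = counts.reshape(words.shape[:-1])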
Example No. 5
def generate_embeddings(config,
                        tar_path,
                        part,
                        dest_path,
                        format_,
                        average=False,
                        encoder_embeddings=None,
                        **kwargs):
    """
    generate embeddings for all the defintions, average them and serialize OR
    if encoder_embeddings, serialize the models' encoder embeddings

    config: name of the config of the model
    tar_path: tar path of the model parameters
    part: part of the dataset (should be either 'train', 'valid', 'test' or 'all')
    dest_path: directory where the serialized embeddings will be written
    format: either 'dict' or 'glove'
    encoder_embeddings: None, 'only', 'mixed', 'if_missing'
      - None: don't include encoder embeddings
      - 'only': don't read any data, just serialize the encoder embeddings
      - 'mixed': add the encoder embeddings to the list of definition embeddings
      - 'if_missing': add the encoder embeddings when there is no corresponding def
    average: if true, multi-prototype embeddings will be averaged
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    c = config
    data, model = initialize_data_and_model(c, train_phase=False)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')
    keys = T.lmatrix('keys')
    n_identical_keys = T.lvector('n_identical_keys')
    sym_args = [words, words_mask]

    if format_ not in ['dict', 'glove']:
        raise ValueError("format should be either: dict, glove")

    if not c['encoder'] and encoder_embeddings != 'only':
        raise ValueError('Error: this model does not have an encoder.')

    if use_keys(c):
        sym_args.append(keys)
    if use_n_identical_keys(c):
        sym_args.append(n_identical_keys)

    costs = model.apply(*sym_args, train_phase=False)

    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    if encoder_embeddings:
        if encoder_embeddings == 'only' and not c['encoder']:
            embeddings_array = model.get_def_embeddings_params('key').eval()
        else:
            embeddings_array = model.get_def_embeddings_params('main').eval()
        entries = model.get_embeddings_entries()
        enc_embeddings = {
            e: np.asarray(a)
            for e, a in zip(entries, embeddings_array)
        }
        if encoder_embeddings == 'only':
            serialize_embeddings(enc_embeddings, format_, dest_path,
                                 "encoder_embeddings")
            return 0

    embeddings_var, = VariableFilter(name='embeddings')(cg)
    compute = {"embeddings": embeddings_var}
    if c['proximity_coef'] != 0:
        prox_var, = VariableFilter(name='proximity_term')(cg)
        compute["proximity_term"] = prox_var
    print "sym args", sym_args
    predict_f = theano.function(sym_args, compute)
    batch_size = 256  # size of test_unseen
    stream = data.get_stream(part,
                             batch_size=batch_size,
                             max_length=c['max_length'],
                             remove_keys=False,
                             remove_n_identical_keys=False)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    vocab = model._vocab
    print "start computing"
    embeddings = defaultdict(list)
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i % 10 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        keys = input_data['keys']
        n_identical_keys = input_data['n_identical_keys']
        args = [words, words_mask]
        if use_keys(c):
            args.append(keys)
        if use_n_identical_keys(c):
            args.append(n_identical_keys)

        to_save = predict_f(*args)
        for k, h in zip(keys, to_save['embeddings']):
            key = vec2str(k)
            if encoder_embeddings == 'if_missing':
                try:
                    del enc_embeddings[key]
                except KeyError:
                    pass
            embeddings[key].append(h)
        i += 1

    if encoder_embeddings in ['mixed', 'if_missing']:
        for k, e in enc_embeddings.iteritems():
            embeddings[k].append(e)

    if encoder_embeddings == 'mixed':
        prefix_fname = 'mix_e_'
    elif encoder_embeddings == 'if_missing':
        prefix_fname = 'if_mis_e_'
    else:
        prefix_fname = ''

    # combine:
    if average:
        mean_embeddings = {}
        for k, vecs in embeddings.items():
            mean_embeddings[k] = np.mean(np.asarray(vecs), axis=0)
        serialize_embeddings(mean_embeddings, format_, dest_path,
                             prefix_fname + "mean_embeddings")
    else:
        serialize_embeddings(embeddings, format_, dest_path,
                             prefix_fname + "embeddings")
Example No. 6
def evaluate_extractive_qa(config,
                           tar_path,
                           part,
                           num_examples,
                           dest_path,
                           qids=None,
                           dataset=None):
    if not dest_path:
        dest_path = os.path.join(os.path.dirname(tar_path), 'predictions.json')
    log_path = os.path.splitext(dest_path)[0] + '_log.json'

    if qids:
        qids = qids.split(',')

    if dataset:
        dataset = SQuADDataset(dataset, ('all', ))

    c = config
    data, qam = initialize_data_and_model(c)
    costs = qam.apply_with_default_vars()
    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    compute = {'begins': predicted_begins, 'ends': predicted_ends}
    if c['coattention']:
        d2q_att_weights, = VariableFilter(name='d2q_att_weights')(cg)
        q2d_att_weights, = VariableFilter(name='q2d_att_weights')(cg)
        compute.update({'d2q': d2q_att_weights, 'q2d': q2d_att_weights})
    compute['costs'] = costs
    predict_func = theano.function(qam.input_vars.values(), compute)
    logger.debug("Ready to evaluate")

    done_examples = 0
    num_correct = 0

    def print_stats():
        print('EXACT MATCH RATIO: {}'.format(num_correct /
                                             float(done_examples)))

    predictions = {}
    log = {}

    stream = data.get_stream(part,
                             batch_size=1,
                             shuffle=part == 'train',
                             raw_text=True,
                             q_ids=True,
                             dataset=dataset)
    for example in stream.get_epoch_iterator(as_dict=True):
        if done_examples == num_examples:
            break
        q_id = vec2str(example['q_ids'][0])
        if qids and q_id not in qids:
            continue

        example['contexts_text'] = [map(vec2str, example['contexts_text'][0])]
        example['questions_text'] = [
            map(vec2str, example['questions_text'][0])
        ]
        feed = dict(example)
        # Drop the raw-text fields; the compiled function only accepts the
        # numeric variables in qam.input_vars.
        del feed['q_ids']
        del feed['contexts_text']
        del feed['questions_text']
        del feed['contexts_text_mask']
        result = predict_func(**feed)
        correct_answer_span = slice(example['answer_begins'][0],
                                    example['answer_ends'][0])
        predicted_answer_span = slice(result['begins'][0], result['ends'][0])
        correct_answer = example['contexts_text'][0][correct_answer_span]
        answer = example['contexts_text'][0][predicted_answer_span]
        is_correct = correct_answer_span == predicted_answer_span
        context = example['contexts_text'][0]
        question = example['questions_text'][0]
        context_def_map = example['contexts_def_map']

        # pretty print
        outcome = 'correct' if is_correct else 'wrong'
        print('#{}'.format(done_examples))
        print(u"CONTEXT:", detokenize(context))
        print(u"QUESTION:", detokenize(question))
        print(u"RIGHT ANSWER: {}".format(detokenize(correct_answer)))
        print(
            u"ANSWER (span=[{}, {}], {}):".format(predicted_answer_span.start,
                                                  predicted_answer_span.stop,
                                                  outcome), detokenize(answer))
        print(u"COST: {}".format(float(result['costs'][0])))
        print(u"DEFINITIONS AVAILABLE FOR:")
        for pos in set(context_def_map[:, 1]):
            print(context[pos])
        print()

        # update statistics
        done_examples += 1
        num_correct += is_correct

        # save the results
        predictions[q_id] = detokenize(answer)
        log_entry = {
            'context': context,
            'question': question,
            'answer': answer,
            'correct_answer': correct_answer,
            'cost': float(result['costs'][0])
        }
        if c['coattention']:
            log_entry['d2q'] = cPickle.dumps(result['d2q'][0])
            log_entry['q2d'] = cPickle.dumps(result['q2d'][0])
        log[q_id] = log_entry

        if done_examples % 100 == 0:
            print_stats()
    print_stats()

    with open(log_path, 'w') as dst:
        json.dump(log, dst, indent=2, sort_keys=True)
    with open(dest_path, 'w') as dst:
        json.dump(predictions, dst, indent=2, sort_keys=True)
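
The predictions file maps each question id to the predicted answer text, the layout the official SQuAD evaluation script expects; the companion predictions_log.json holds per-question contexts, answers, costs and, for coattention models, pickled attention maps. A hypothetical invocation (paths and arguments are illustrative):

evaluate_extractive_qa(config, 'models/qa/params.tar', 'dev', 100,
                       'models/qa/predictions.json')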